2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/inv_txfm.h"
// Full inverse 4x4 Walsh-Hadamard transform (all 16 coefficients), adding
// the reconstructed residual into the 4x4 prediction at `dest`.
// `stride` is the row pitch of `dest` in pixels.
// NOTE(review): this view of the file is missing several lines of the body
// (e.g. the declaration of `i` and part of the butterfly network).
18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
19 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
20 0.5 shifts per pixel. */
22 tran_low_t output[16];  // intermediate 4x4 result of the row pass
23 tran_high_t a1, b1, c1, d1, e1;
24 const tran_low_t *ip = input;
25 tran_low_t *op = output;
// First pass: transform each of the 4 rows.
27 for (i = 0; i < 4; i++) {
// Undo the UNIT_QUANT scaling applied by the forward WHT.
28 a1 = ip[0] >> UNIT_QUANT_SHIFT;
29 c1 = ip[1] >> UNIT_QUANT_SHIFT;
30 d1 = ip[2] >> UNIT_QUANT_SHIFT;
31 b1 = ip[3] >> UNIT_QUANT_SHIFT;
// Second pass: transform each of the 4 columns, then clip-add the four
// column outputs into the corresponding rows of `dest`.
48 for (i = 0; i < 4; i++) {
60 dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
61 dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
62 dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
63 dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
// Inverse 4x4 WHT specialized for the case where only the first (DC)
// coefficient is nonzero; the result is clip-added into `dest`.
70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
74 const tran_low_t *ip = in;
// Undo forward-transform scaling on the single nonzero coefficient.
77 a1 = ip[0] >> UNIT_QUANT_SHIFT;
81 op[1] = op[2] = op[3] = WRAPLOW(e1);
// Add the constant column pattern (a1, e1, e1, e1) to each of the 4 columns.
84 for (i = 0; i < 4; i++) {
87 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
88 dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
89 dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
90 dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
// 1-D 4-point inverse ADST. Reads 4 coefficients from `input` and writes 4
// transformed values to `output`. An all-zero input is short-circuited to an
// all-zero output.
96 void iadst4_c(const tran_low_t *input, tran_low_t *output) {
97 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
98 tran_low_t x0 = input[0];
99 tran_low_t x1 = input[1];
100 tran_low_t x2 = input[2];
101 tran_low_t x3 = input[3];
// Fast path: nothing to transform when every coefficient is zero.
103 if (!(x0 | x1 | x2 | x3)) {
104 memset(output, 0, 4 * sizeof(*output));
115 s7 = WRAPLOW(x0 - x2 + x3);
122 // 1-D transform scaling factor is sqrt(2).
123 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
124 // + 1b (addition) = 29b.
125 // Hence the output bit depth is 15b.
126 output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
127 output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
128 output[2] = WRAPLOW(dct_const_round_shift(s2));
129 output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
// 1-D 4-point inverse DCT: even butterfly on input[0]/input[2], odd rotation
// on input[1]/input[3], then the final add/sub stage into `output`.
132 void idct4_c(const tran_low_t *input, tran_low_t *output) {
134 tran_high_t temp1, temp2;
// Even part: DC/Nyquist butterfly scaled by cospi_16_64.
137 temp1 = (input[0] + input[2]) * cospi_16_64;
138 temp2 = (input[0] - input[2]) * cospi_16_64;
139 step[0] = WRAPLOW(dct_const_round_shift(temp1));
140 step[1] = WRAPLOW(dct_const_round_shift(temp2));
// Odd part: rotation by (cospi_24_64, cospi_8_64).
141 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
142 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
143 step[2] = WRAPLOW(dct_const_round_shift(temp1));
144 step[3] = WRAPLOW(dct_const_round_shift(temp2));
// Final butterfly combining even and odd halves.
147 output[0] = WRAPLOW(step[0] + step[3]);
148 output[1] = WRAPLOW(step[1] + step[2]);
149 output[2] = WRAPLOW(step[1] - step[2]);
150 output[3] = WRAPLOW(step[0] - step[3]);
// 2-D 4x4 inverse DCT (all 16 coefficients): row pass into `out`, column
// pass, then round by 1/16 and clip-add the residual into `dest`.
153 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
155 tran_low_t out[4 * 4];
156 tran_low_t *outptr = out;
157 tran_low_t temp_in[4], temp_out[4];
// Rows first.
160 for (i = 0; i < 4; ++i) {
161 idct4_c(input, outptr);
// Then columns: gather column i, transform, round (>> 4) and add to dest.
167 for (i = 0; i < 4; ++i) {
168 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
169 idct4_c(temp_in, temp_out);
170 for (j = 0; j < 4; ++j) {
171 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
172 ROUND_POWER_OF_TWO(temp_out[j], 4));
// 4x4 inverse DCT specialized for a DC-only block: the two 1-D passes reduce
// to two scalings by cospi_16_64, producing a single value `a1` that is
// clip-added to every pixel of the 4x4 destination block.
177 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
180 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
182 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
// Final rounding matches the full transform's >> 4.
183 a1 = ROUND_POWER_OF_TWO(out, 4);
185 for (i = 0; i < 4; i++) {
186 dest[0] = clip_pixel_add(dest[0], a1);
187 dest[1] = clip_pixel_add(dest[1], a1);
188 dest[2] = clip_pixel_add(dest[2], a1);
189 dest[3] = clip_pixel_add(dest[3], a1);
// 1-D 8-point inverse ADST. Inputs are read in the ADST permutation order
// (x0 = input[7], x1 = input[0], ...); an all-zero input short-circuits to an
// all-zero output. Sign flips on odd output indices are part of the ADST
// definition.
194 void iadst8_c(const tran_low_t *input, tran_low_t *output) {
195 int s0, s1, s2, s3, s4, s5, s6, s7;
196 tran_high_t x0 = input[7];
197 tran_high_t x1 = input[0];
198 tran_high_t x2 = input[5];
199 tran_high_t x3 = input[2];
200 tran_high_t x4 = input[3];
201 tran_high_t x5 = input[4];
202 tran_high_t x6 = input[1];
203 tran_high_t x7 = input[6];
// Fast path for an all-zero coefficient block.
205 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
206 memset(output, 0, 8 * sizeof(*output));
// stage 1: rotations by the odd cospi constants.
211 s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
212 s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
213 s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
214 s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
215 s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
216 s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
217 s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
218 s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
220 x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
221 x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
222 x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
223 x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
224 x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
225 x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
226 x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
227 x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
// stage 2: only the upper half (x4..x7) needs new rotations.
234 s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
235 s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
236 s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
237 s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
239 x0 = WRAPLOW(s0 + s2);
240 x1 = WRAPLOW(s1 + s3);
241 x2 = WRAPLOW(s0 - s2);
242 x3 = WRAPLOW(s1 - s3);
243 x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
244 x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
245 x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
246 x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
// stage 3: final cospi_16_64 scalings.
249 s2 = (int)(cospi_16_64 * (x2 + x3));
250 s3 = (int)(cospi_16_64 * (x2 - x3));
251 s6 = (int)(cospi_16_64 * (x6 + x7));
252 s7 = (int)(cospi_16_64 * (x6 - x7));
254 x2 = WRAPLOW(dct_const_round_shift(s2));
255 x3 = WRAPLOW(dct_const_round_shift(s3));
256 x6 = WRAPLOW(dct_const_round_shift(s6));
257 x7 = WRAPLOW(dct_const_round_shift(s7));
// Output permutation with alternating sign flips.
259 output[0] = WRAPLOW(x0);
260 output[1] = WRAPLOW(-x4);
261 output[2] = WRAPLOW(x6);
262 output[3] = WRAPLOW(-x2);
263 output[4] = WRAPLOW(x3);
264 output[5] = WRAPLOW(-x7);
265 output[6] = WRAPLOW(x5);
266 output[7] = WRAPLOW(-x1);
// 1-D 8-point inverse DCT operating through alternating step1/step2 butterfly
// stages, ending with the symmetric add/sub stage into `output`.
269 void idct8_c(const tran_low_t *input, tran_low_t *output) {
270 tran_low_t step1[8], step2[8];
271 tran_high_t temp1, temp2;
// stage 1 (odd half): rotations producing step1[4..7].
278 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
279 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
280 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
281 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
282 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
283 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
284 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
285 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
// stage 2: 4-point idct on the even half, butterflies on the odd half.
288 temp1 = (step1[0] + step1[2]) * cospi_16_64;
289 temp2 = (step1[0] - step1[2]) * cospi_16_64;
290 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
291 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
292 temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
293 temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
294 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
295 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
296 step2[4] = WRAPLOW(step1[4] + step1[5]);
297 step2[5] = WRAPLOW(step1[4] - step1[5]);
298 step2[6] = WRAPLOW(-step1[6] + step1[7]);
299 step2[7] = WRAPLOW(step1[6] + step1[7]);
// stage 3: combine even half; rotate step2[5]/step2[6] by cospi_16_64.
302 step1[0] = WRAPLOW(step2[0] + step2[3]);
303 step1[1] = WRAPLOW(step2[1] + step2[2]);
304 step1[2] = WRAPLOW(step2[1] - step2[2]);
305 step1[3] = WRAPLOW(step2[0] - step2[3]);
307 temp1 = (step2[6] - step2[5]) * cospi_16_64;
308 temp2 = (step2[5] + step2[6]) * cospi_16_64;
309 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
310 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
// final stage: symmetric output butterfly.
314 output[0] = WRAPLOW(step1[0] + step1[7]);
315 output[1] = WRAPLOW(step1[1] + step1[6]);
316 output[2] = WRAPLOW(step1[2] + step1[5]);
317 output[3] = WRAPLOW(step1[3] + step1[4]);
318 output[4] = WRAPLOW(step1[3] - step1[4]);
319 output[5] = WRAPLOW(step1[2] - step1[5]);
320 output[6] = WRAPLOW(step1[1] - step1[6]);
321 output[7] = WRAPLOW(step1[0] - step1[7]);
// 2-D 8x8 inverse DCT (all 64 coefficients): row pass, column pass, then
// round by 1/32 (>> 5) and clip-add into `dest`.
324 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
326 tran_low_t out[8 * 8];
327 tran_low_t *outptr = out;
328 tran_low_t temp_in[8], temp_out[8];
330 // First transform rows
331 for (i = 0; i < 8; ++i) {
332 idct8_c(input, outptr);
337 // Then transform columns
338 for (i = 0; i < 8; ++i) {
339 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
340 idct8_c(temp_in, temp_out);
341 for (j = 0; j < 8; ++j) {
342 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
343 ROUND_POWER_OF_TWO(temp_out[j], 5));
// 2-D 8x8 inverse DCT for blocks whose nonzero coefficients lie in the first
// 12 positions: only the first 4 rows need a row transform (the rest of `out`
// stays zero-initialized); all 8 columns are then transformed as usual.
348 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
350 tran_low_t out[8 * 8] = { 0 };
351 tran_low_t *outptr = out;
352 tran_low_t temp_in[8], temp_out[8];
354 // First transform rows
355 // Only first 4 row has non-zero coefs
356 for (i = 0; i < 4; ++i) {
357 idct8_c(input, outptr);
362 // Then transform columns
363 for (i = 0; i < 8; ++i) {
364 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
365 idct8_c(temp_in, temp_out);
366 for (j = 0; j < 8; ++j) {
367 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
368 ROUND_POWER_OF_TWO(temp_out[j], 5));
// 8x8 inverse DCT specialized for a DC-only block: two cospi_16_64 scalings
// yield a single value `a1` clip-added to every pixel of the 8x8 block.
373 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
376 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
378 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
// Final rounding matches the full transform's >> 5.
379 a1 = ROUND_POWER_OF_TWO(out, 5);
380 for (j = 0; j < 8; ++j) {
381 for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
// 1-D 16-point inverse ADST. Inputs are read in the ADST permutation order
// (x0 = input[15], x1 = input[0], ...); an all-zero input short-circuits to
// an all-zero output. The butterfly network runs in stages, with
// dct_const_round_shift() applied after each multiplication stage, ending in
// a permuted, partially sign-flipped output.
386 void iadst16_c(const tran_low_t *input, tran_low_t *output) {
387 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
388 tran_high_t s9, s10, s11, s12, s13, s14, s15;
389 tran_high_t x0 = input[15];
390 tran_high_t x1 = input[0];
391 tran_high_t x2 = input[13];
392 tran_high_t x3 = input[2];
393 tran_high_t x4 = input[11];
394 tran_high_t x5 = input[4];
395 tran_high_t x6 = input[9];
396 tran_high_t x7 = input[6];
397 tran_high_t x8 = input[7];
398 tran_high_t x9 = input[8];
399 tran_high_t x10 = input[5];
400 tran_high_t x11 = input[10];
401 tran_high_t x12 = input[3];
402 tran_high_t x13 = input[12];
403 tran_high_t x14 = input[1];
404 tran_high_t x15 = input[14];
// Fast path for an all-zero coefficient block.
406 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
408 memset(output, 0, 16 * sizeof(*output));
// stage 1: rotations by the odd cospi constants (1, 5, 9, ..., 29).
413 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
414 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
415 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
416 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
417 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
418 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
419 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
420 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
421 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
422 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
423 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
424 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
425 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
426 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
427 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
428 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
430 x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
431 x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
432 x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
433 x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
434 x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
435 x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
436 x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
437 x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
438 x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
439 x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
440 x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
441 x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
442 x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
443 x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
444 x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
445 x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
// stage 2: rotations on the upper half (x8..x15) only.
456 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
457 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
458 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
459 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
460 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
461 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
462 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
463 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
465 x0 = WRAPLOW(s0 + s4);
466 x1 = WRAPLOW(s1 + s5);
467 x2 = WRAPLOW(s2 + s6);
468 x3 = WRAPLOW(s3 + s7);
469 x4 = WRAPLOW(s0 - s4);
470 x5 = WRAPLOW(s1 - s5);
471 x6 = WRAPLOW(s2 - s6);
472 x7 = WRAPLOW(s3 - s7);
473 x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
474 x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
475 x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
476 x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
477 x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
478 x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
479 x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
480 x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
// stage 3: cospi_8_64/cospi_24_64 rotations on two quarters.
487 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
488 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
489 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
490 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
495 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
496 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
497 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
498 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
500 x0 = WRAPLOW(s0 + s2);
501 x1 = WRAPLOW(s1 + s3);
502 x2 = WRAPLOW(s0 - s2);
503 x3 = WRAPLOW(s1 - s3);
504 x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
505 x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
506 x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
507 x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
508 x8 = WRAPLOW(s8 + s10);
509 x9 = WRAPLOW(s9 + s11);
510 x10 = WRAPLOW(s8 - s10);
511 x11 = WRAPLOW(s9 - s11);
512 x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
513 x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
514 x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
515 x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
// stage 4: final cospi_16_64 scalings.
518 s2 = (-cospi_16_64) * (x2 + x3);
519 s3 = cospi_16_64 * (x2 - x3);
520 s6 = cospi_16_64 * (x6 + x7);
521 s7 = cospi_16_64 * (-x6 + x7);
522 s10 = cospi_16_64 * (x10 + x11);
523 s11 = cospi_16_64 * (-x10 + x11);
524 s14 = (-cospi_16_64) * (x14 + x15);
525 s15 = cospi_16_64 * (x14 - x15);
527 x2 = WRAPLOW(dct_const_round_shift(s2));
528 x3 = WRAPLOW(dct_const_round_shift(s3));
529 x6 = WRAPLOW(dct_const_round_shift(s6));
530 x7 = WRAPLOW(dct_const_round_shift(s7));
531 x10 = WRAPLOW(dct_const_round_shift(s10));
532 x11 = WRAPLOW(dct_const_round_shift(s11));
533 x14 = WRAPLOW(dct_const_round_shift(s14));
534 x15 = WRAPLOW(dct_const_round_shift(s15));
// Output permutation with sign flips defined by the ADST-16 matrix.
536 output[0] = WRAPLOW(x0);
537 output[1] = WRAPLOW(-x8);
538 output[2] = WRAPLOW(x12);
539 output[3] = WRAPLOW(-x4);
540 output[4] = WRAPLOW(x6);
541 output[5] = WRAPLOW(x14);
542 output[6] = WRAPLOW(x10);
543 output[7] = WRAPLOW(x2);
544 output[8] = WRAPLOW(x3);
545 output[9] = WRAPLOW(x11);
546 output[10] = WRAPLOW(x15);
547 output[11] = WRAPLOW(x7);
548 output[12] = WRAPLOW(x5);
549 output[13] = WRAPLOW(-x13);
550 output[14] = WRAPLOW(x9);
551 output[15] = WRAPLOW(-x1);
// 1-D 16-point inverse DCT. The initial load applies the bit-reversal-style
// reordering (indices written as `k / 2` of the 32-point layout), then the
// butterfly network alternates between step1/step2 through the stages, ending
// with the symmetric add/sub output stage.
554 void idct16_c(const tran_low_t *input, tran_low_t *output) {
555 tran_low_t step1[16], step2[16];
556 tran_high_t temp1, temp2;
// Load with transform-order permutation.
559 step1[0] = input[0 / 2];
560 step1[1] = input[16 / 2];
561 step1[2] = input[8 / 2];
562 step1[3] = input[24 / 2];
563 step1[4] = input[4 / 2];
564 step1[5] = input[20 / 2];
565 step1[6] = input[12 / 2];
566 step1[7] = input[28 / 2];
567 step1[8] = input[2 / 2];
568 step1[9] = input[18 / 2];
569 step1[10] = input[10 / 2];
570 step1[11] = input[26 / 2];
571 step1[12] = input[6 / 2];
572 step1[13] = input[22 / 2];
573 step1[14] = input[14 / 2];
574 step1[15] = input[30 / 2];
// stage 2: rotations producing step2[8..15].
586 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
587 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
588 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
589 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
591 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
592 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
593 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
594 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
596 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
597 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
598 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
599 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
601 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
602 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
603 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
604 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
// stage 3: rotations for step1[4..7]; butterflies for step1[8..15].
612 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
613 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
614 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
615 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
616 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
617 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
618 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
619 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
621 step1[8] = WRAPLOW(step2[8] + step2[9]);
622 step1[9] = WRAPLOW(step2[8] - step2[9]);
623 step1[10] = WRAPLOW(-step2[10] + step2[11]);
624 step1[11] = WRAPLOW(step2[10] + step2[11]);
625 step1[12] = WRAPLOW(step2[12] + step2[13]);
626 step1[13] = WRAPLOW(step2[12] - step2[13]);
627 step1[14] = WRAPLOW(-step2[14] + step2[15]);
628 step1[15] = WRAPLOW(step2[14] + step2[15]);
// stage 4: even-half 4-point idct; cross rotations on 9/14 and 10/13.
631 temp1 = (step1[0] + step1[1]) * cospi_16_64;
632 temp2 = (step1[0] - step1[1]) * cospi_16_64;
633 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
634 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
635 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
636 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
637 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
638 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
639 step2[4] = WRAPLOW(step1[4] + step1[5]);
640 step2[5] = WRAPLOW(step1[4] - step1[5]);
641 step2[6] = WRAPLOW(-step1[6] + step1[7]);
642 step2[7] = WRAPLOW(step1[6] + step1[7]);
645 step2[15] = step1[15];
646 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
647 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
648 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
649 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
650 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
651 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
652 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
653 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
654 step2[11] = step1[11];
655 step2[12] = step1[12];
// stage 5: combine even half; cospi_16_64 rotation on 5/6.
658 step1[0] = WRAPLOW(step2[0] + step2[3]);
659 step1[1] = WRAPLOW(step2[1] + step2[2]);
660 step1[2] = WRAPLOW(step2[1] - step2[2]);
661 step1[3] = WRAPLOW(step2[0] - step2[3]);
663 temp1 = (step2[6] - step2[5]) * cospi_16_64;
664 temp2 = (step2[5] + step2[6]) * cospi_16_64;
665 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
666 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
669 step1[8] = WRAPLOW(step2[8] + step2[11]);
670 step1[9] = WRAPLOW(step2[9] + step2[10]);
671 step1[10] = WRAPLOW(step2[9] - step2[10]);
672 step1[11] = WRAPLOW(step2[8] - step2[11]);
673 step1[12] = WRAPLOW(-step2[12] + step2[15]);
674 step1[13] = WRAPLOW(-step2[13] + step2[14]);
675 step1[14] = WRAPLOW(step2[13] + step2[14]);
676 step1[15] = WRAPLOW(step2[12] + step2[15]);
// stage 6: 8-point output butterfly; cospi_16_64 rotations on 10/13, 11/12.
679 step2[0] = WRAPLOW(step1[0] + step1[7]);
680 step2[1] = WRAPLOW(step1[1] + step1[6]);
681 step2[2] = WRAPLOW(step1[2] + step1[5]);
682 step2[3] = WRAPLOW(step1[3] + step1[4]);
683 step2[4] = WRAPLOW(step1[3] - step1[4]);
684 step2[5] = WRAPLOW(step1[2] - step1[5]);
685 step2[6] = WRAPLOW(step1[1] - step1[6]);
686 step2[7] = WRAPLOW(step1[0] - step1[7]);
689 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
690 temp2 = (step1[10] + step1[13]) * cospi_16_64;
691 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
692 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
693 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
694 temp2 = (step1[11] + step1[12]) * cospi_16_64;
695 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
696 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
697 step2[14] = step1[14];
698 step2[15] = step1[15];
// final stage: symmetric 16-point output butterfly.
701 output[0] = WRAPLOW(step2[0] + step2[15]);
702 output[1] = WRAPLOW(step2[1] + step2[14]);
703 output[2] = WRAPLOW(step2[2] + step2[13]);
704 output[3] = WRAPLOW(step2[3] + step2[12]);
705 output[4] = WRAPLOW(step2[4] + step2[11]);
706 output[5] = WRAPLOW(step2[5] + step2[10]);
707 output[6] = WRAPLOW(step2[6] + step2[9]);
708 output[7] = WRAPLOW(step2[7] + step2[8]);
709 output[8] = WRAPLOW(step2[7] - step2[8]);
710 output[9] = WRAPLOW(step2[6] - step2[9]);
711 output[10] = WRAPLOW(step2[5] - step2[10]);
712 output[11] = WRAPLOW(step2[4] - step2[11]);
713 output[12] = WRAPLOW(step2[3] - step2[12]);
714 output[13] = WRAPLOW(step2[2] - step2[13]);
715 output[14] = WRAPLOW(step2[1] - step2[14]);
716 output[15] = WRAPLOW(step2[0] - step2[15]);
// 2-D 16x16 inverse DCT (all 256 coefficients): row pass, column pass, then
// round by 1/64 (>> 6) and clip-add into `dest`.
719 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
722 tran_low_t out[16 * 16];
723 tran_low_t *outptr = out;
724 tran_low_t temp_in[16], temp_out[16];
726 // First transform rows
727 for (i = 0; i < 16; ++i) {
728 idct16_c(input, outptr);
733 // Then transform columns
734 for (i = 0; i < 16; ++i) {
735 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
736 idct16_c(temp_in, temp_out);
737 for (j = 0; j < 16; ++j) {
738 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
739 ROUND_POWER_OF_TWO(temp_out[j], 6));
// 2-D 16x16 inverse DCT for blocks whose nonzero coefficients fit in the
// upper-left 8x8 area: only the first 8 rows need a row transform (the rest
// of `out` stays zero-initialized); all 16 columns are then transformed.
744 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
747 tran_low_t out[16 * 16] = { 0 };
748 tran_low_t *outptr = out;
749 tran_low_t temp_in[16], temp_out[16];
751 // First transform rows. Since all non-zero dct coefficients are in
752 // upper-left 8x8 area, we only need to calculate first 8 rows here.
753 for (i = 0; i < 8; ++i) {
754 idct16_c(input, outptr);
759 // Then transform columns
760 for (i = 0; i < 16; ++i) {
761 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
762 idct16_c(temp_in, temp_out);
763 for (j = 0; j < 16; ++j) {
764 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
765 ROUND_POWER_OF_TWO(temp_out[j], 6));
// 2-D 16x16 inverse DCT for blocks whose nonzero coefficients fit in the
// upper-left 4x4 area: only the first 4 rows need a row transform (the rest
// of `out` stays zero-initialized); all 16 columns are then transformed.
770 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
773 tran_low_t out[16 * 16] = { 0 };
774 tran_low_t *outptr = out;
775 tran_low_t temp_in[16], temp_out[16];
777 // First transform rows. Since all non-zero dct coefficients are in
778 // upper-left 4x4 area, we only need to calculate first 4 rows here.
779 for (i = 0; i < 4; ++i) {
780 idct16_c(input, outptr);
785 // Then transform columns
786 for (i = 0; i < 16; ++i) {
787 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
788 idct16_c(temp_in, temp_out);
789 for (j = 0; j < 16; ++j) {
790 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
791 ROUND_POWER_OF_TWO(temp_out[j], 6));
// 16x16 inverse DCT specialized for a DC-only block: two cospi_16_64
// scalings yield a single value `a1` clip-added to every pixel of the block.
796 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
799 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
801 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
// Final rounding matches the full transform's >> 6.
802 a1 = ROUND_POWER_OF_TWO(out, 6);
803 for (j = 0; j < 16; ++j) {
804 for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
809 void idct32_c(const tran_low_t *input, tran_low_t *output) {
810 tran_low_t step1[32], step2[32];
811 tran_high_t temp1, temp2;
815 step1[1] = input[16];
817 step1[3] = input[24];
819 step1[5] = input[20];
820 step1[6] = input[12];
821 step1[7] = input[28];
823 step1[9] = input[18];
824 step1[10] = input[10];
825 step1[11] = input[26];
826 step1[12] = input[6];
827 step1[13] = input[22];
828 step1[14] = input[14];
829 step1[15] = input[30];
831 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
832 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
833 step1[16] = WRAPLOW(dct_const_round_shift(temp1));
834 step1[31] = WRAPLOW(dct_const_round_shift(temp2));
836 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
837 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
838 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
839 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
841 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
842 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
843 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
844 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
846 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
847 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
848 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
849 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
851 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
852 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
853 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
854 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
856 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
857 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
858 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
859 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
861 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
862 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
863 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
864 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
866 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
867 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
868 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
869 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
881 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
882 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
883 step2[8] = WRAPLOW(dct_const_round_shift(temp1));
884 step2[15] = WRAPLOW(dct_const_round_shift(temp2));
886 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
887 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
888 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
889 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
891 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
892 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
893 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
894 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
896 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
897 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
898 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
899 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
901 step2[16] = WRAPLOW(step1[16] + step1[17]);
902 step2[17] = WRAPLOW(step1[16] - step1[17]);
903 step2[18] = WRAPLOW(-step1[18] + step1[19]);
904 step2[19] = WRAPLOW(step1[18] + step1[19]);
905 step2[20] = WRAPLOW(step1[20] + step1[21]);
906 step2[21] = WRAPLOW(step1[20] - step1[21]);
907 step2[22] = WRAPLOW(-step1[22] + step1[23]);
908 step2[23] = WRAPLOW(step1[22] + step1[23]);
909 step2[24] = WRAPLOW(step1[24] + step1[25]);
910 step2[25] = WRAPLOW(step1[24] - step1[25]);
911 step2[26] = WRAPLOW(-step1[26] + step1[27]);
912 step2[27] = WRAPLOW(step1[26] + step1[27]);
913 step2[28] = WRAPLOW(step1[28] + step1[29]);
914 step2[29] = WRAPLOW(step1[28] - step1[29]);
915 step2[30] = WRAPLOW(-step1[30] + step1[31]);
916 step2[31] = WRAPLOW(step1[30] + step1[31]);
924 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
925 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
926 step1[4] = WRAPLOW(dct_const_round_shift(temp1));
927 step1[7] = WRAPLOW(dct_const_round_shift(temp2));
928 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
929 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
930 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
931 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
933 step1[8] = WRAPLOW(step2[8] + step2[9]);
934 step1[9] = WRAPLOW(step2[8] - step2[9]);
935 step1[10] = WRAPLOW(-step2[10] + step2[11]);
936 step1[11] = WRAPLOW(step2[10] + step2[11]);
937 step1[12] = WRAPLOW(step2[12] + step2[13]);
938 step1[13] = WRAPLOW(step2[12] - step2[13]);
939 step1[14] = WRAPLOW(-step2[14] + step2[15]);
940 step1[15] = WRAPLOW(step2[14] + step2[15]);
942 step1[16] = step2[16];
943 step1[31] = step2[31];
944 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
945 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
946 step1[17] = WRAPLOW(dct_const_round_shift(temp1));
947 step1[30] = WRAPLOW(dct_const_round_shift(temp2));
948 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
949 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
950 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
951 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
952 step1[19] = step2[19];
953 step1[20] = step2[20];
954 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
955 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
956 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
957 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
958 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
959 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
960 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
961 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
962 step1[23] = step2[23];
963 step1[24] = step2[24];
964 step1[27] = step2[27];
965 step1[28] = step2[28];
968 temp1 = (step1[0] + step1[1]) * cospi_16_64;
969 temp2 = (step1[0] - step1[1]) * cospi_16_64;
970 step2[0] = WRAPLOW(dct_const_round_shift(temp1));
971 step2[1] = WRAPLOW(dct_const_round_shift(temp2));
972 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
973 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
974 step2[2] = WRAPLOW(dct_const_round_shift(temp1));
975 step2[3] = WRAPLOW(dct_const_round_shift(temp2));
976 step2[4] = WRAPLOW(step1[4] + step1[5]);
977 step2[5] = WRAPLOW(step1[4] - step1[5]);
978 step2[6] = WRAPLOW(-step1[6] + step1[7]);
979 step2[7] = WRAPLOW(step1[6] + step1[7]);
982 step2[15] = step1[15];
983 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
984 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
985 step2[9] = WRAPLOW(dct_const_round_shift(temp1));
986 step2[14] = WRAPLOW(dct_const_round_shift(temp2));
987 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
988 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
989 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
990 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
991 step2[11] = step1[11];
992 step2[12] = step1[12];
994 step2[16] = WRAPLOW(step1[16] + step1[19]);
995 step2[17] = WRAPLOW(step1[17] + step1[18]);
996 step2[18] = WRAPLOW(step1[17] - step1[18]);
997 step2[19] = WRAPLOW(step1[16] - step1[19]);
998 step2[20] = WRAPLOW(-step1[20] + step1[23]);
999 step2[21] = WRAPLOW(-step1[21] + step1[22]);
1000 step2[22] = WRAPLOW(step1[21] + step1[22]);
1001 step2[23] = WRAPLOW(step1[20] + step1[23]);
1003 step2[24] = WRAPLOW(step1[24] + step1[27]);
1004 step2[25] = WRAPLOW(step1[25] + step1[26]);
1005 step2[26] = WRAPLOW(step1[25] - step1[26]);
1006 step2[27] = WRAPLOW(step1[24] - step1[27]);
1007 step2[28] = WRAPLOW(-step1[28] + step1[31]);
1008 step2[29] = WRAPLOW(-step1[29] + step1[30]);
1009 step2[30] = WRAPLOW(step1[29] + step1[30]);
1010 step2[31] = WRAPLOW(step1[28] + step1[31]);
1013 step1[0] = WRAPLOW(step2[0] + step2[3]);
1014 step1[1] = WRAPLOW(step2[1] + step2[2]);
1015 step1[2] = WRAPLOW(step2[1] - step2[2]);
1016 step1[3] = WRAPLOW(step2[0] - step2[3]);
1017 step1[4] = step2[4];
1018 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1019 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1020 step1[5] = WRAPLOW(dct_const_round_shift(temp1));
1021 step1[6] = WRAPLOW(dct_const_round_shift(temp2));
1022 step1[7] = step2[7];
1024 step1[8] = WRAPLOW(step2[8] + step2[11]);
1025 step1[9] = WRAPLOW(step2[9] + step2[10]);
1026 step1[10] = WRAPLOW(step2[9] - step2[10]);
1027 step1[11] = WRAPLOW(step2[8] - step2[11]);
1028 step1[12] = WRAPLOW(-step2[12] + step2[15]);
1029 step1[13] = WRAPLOW(-step2[13] + step2[14]);
1030 step1[14] = WRAPLOW(step2[13] + step2[14]);
1031 step1[15] = WRAPLOW(step2[12] + step2[15]);
1033 step1[16] = step2[16];
1034 step1[17] = step2[17];
1035 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
1036 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
1037 step1[18] = WRAPLOW(dct_const_round_shift(temp1));
1038 step1[29] = WRAPLOW(dct_const_round_shift(temp2));
1039 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
1040 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
1041 step1[19] = WRAPLOW(dct_const_round_shift(temp1));
1042 step1[28] = WRAPLOW(dct_const_round_shift(temp2));
1043 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
1044 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
1045 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1046 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1047 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
1048 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
1049 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1050 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1051 step1[22] = step2[22];
1052 step1[23] = step2[23];
1053 step1[24] = step2[24];
1054 step1[25] = step2[25];
1055 step1[30] = step2[30];
1056 step1[31] = step2[31];
1059 step2[0] = WRAPLOW(step1[0] + step1[7]);
1060 step2[1] = WRAPLOW(step1[1] + step1[6]);
1061 step2[2] = WRAPLOW(step1[2] + step1[5]);
1062 step2[3] = WRAPLOW(step1[3] + step1[4]);
1063 step2[4] = WRAPLOW(step1[3] - step1[4]);
1064 step2[5] = WRAPLOW(step1[2] - step1[5]);
1065 step2[6] = WRAPLOW(step1[1] - step1[6]);
1066 step2[7] = WRAPLOW(step1[0] - step1[7]);
1067 step2[8] = step1[8];
1068 step2[9] = step1[9];
1069 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1070 temp2 = (step1[10] + step1[13]) * cospi_16_64;
1071 step2[10] = WRAPLOW(dct_const_round_shift(temp1));
1072 step2[13] = WRAPLOW(dct_const_round_shift(temp2));
1073 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1074 temp2 = (step1[11] + step1[12]) * cospi_16_64;
1075 step2[11] = WRAPLOW(dct_const_round_shift(temp1));
1076 step2[12] = WRAPLOW(dct_const_round_shift(temp2));
1077 step2[14] = step1[14];
1078 step2[15] = step1[15];
1080 step2[16] = WRAPLOW(step1[16] + step1[23]);
1081 step2[17] = WRAPLOW(step1[17] + step1[22]);
1082 step2[18] = WRAPLOW(step1[18] + step1[21]);
1083 step2[19] = WRAPLOW(step1[19] + step1[20]);
1084 step2[20] = WRAPLOW(step1[19] - step1[20]);
1085 step2[21] = WRAPLOW(step1[18] - step1[21]);
1086 step2[22] = WRAPLOW(step1[17] - step1[22]);
1087 step2[23] = WRAPLOW(step1[16] - step1[23]);
1089 step2[24] = WRAPLOW(-step1[24] + step1[31]);
1090 step2[25] = WRAPLOW(-step1[25] + step1[30]);
1091 step2[26] = WRAPLOW(-step1[26] + step1[29]);
1092 step2[27] = WRAPLOW(-step1[27] + step1[28]);
1093 step2[28] = WRAPLOW(step1[27] + step1[28]);
1094 step2[29] = WRAPLOW(step1[26] + step1[29]);
1095 step2[30] = WRAPLOW(step1[25] + step1[30]);
1096 step2[31] = WRAPLOW(step1[24] + step1[31]);
1099 step1[0] = WRAPLOW(step2[0] + step2[15]);
1100 step1[1] = WRAPLOW(step2[1] + step2[14]);
1101 step1[2] = WRAPLOW(step2[2] + step2[13]);
1102 step1[3] = WRAPLOW(step2[3] + step2[12]);
1103 step1[4] = WRAPLOW(step2[4] + step2[11]);
1104 step1[5] = WRAPLOW(step2[5] + step2[10]);
1105 step1[6] = WRAPLOW(step2[6] + step2[9]);
1106 step1[7] = WRAPLOW(step2[7] + step2[8]);
1107 step1[8] = WRAPLOW(step2[7] - step2[8]);
1108 step1[9] = WRAPLOW(step2[6] - step2[9]);
1109 step1[10] = WRAPLOW(step2[5] - step2[10]);
1110 step1[11] = WRAPLOW(step2[4] - step2[11]);
1111 step1[12] = WRAPLOW(step2[3] - step2[12]);
1112 step1[13] = WRAPLOW(step2[2] - step2[13]);
1113 step1[14] = WRAPLOW(step2[1] - step2[14]);
1114 step1[15] = WRAPLOW(step2[0] - step2[15]);
1116 step1[16] = step2[16];
1117 step1[17] = step2[17];
1118 step1[18] = step2[18];
1119 step1[19] = step2[19];
1120 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
1121 temp2 = (step2[20] + step2[27]) * cospi_16_64;
1122 step1[20] = WRAPLOW(dct_const_round_shift(temp1));
1123 step1[27] = WRAPLOW(dct_const_round_shift(temp2));
1124 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
1125 temp2 = (step2[21] + step2[26]) * cospi_16_64;
1126 step1[21] = WRAPLOW(dct_const_round_shift(temp1));
1127 step1[26] = WRAPLOW(dct_const_round_shift(temp2));
1128 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
1129 temp2 = (step2[22] + step2[25]) * cospi_16_64;
1130 step1[22] = WRAPLOW(dct_const_round_shift(temp1));
1131 step1[25] = WRAPLOW(dct_const_round_shift(temp2));
1132 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
1133 temp2 = (step2[23] + step2[24]) * cospi_16_64;
1134 step1[23] = WRAPLOW(dct_const_round_shift(temp1));
1135 step1[24] = WRAPLOW(dct_const_round_shift(temp2));
1136 step1[28] = step2[28];
1137 step1[29] = step2[29];
1138 step1[30] = step2[30];
1139 step1[31] = step2[31];
1142 output[0] = WRAPLOW(step1[0] + step1[31]);
1143 output[1] = WRAPLOW(step1[1] + step1[30]);
1144 output[2] = WRAPLOW(step1[2] + step1[29]);
1145 output[3] = WRAPLOW(step1[3] + step1[28]);
1146 output[4] = WRAPLOW(step1[4] + step1[27]);
1147 output[5] = WRAPLOW(step1[5] + step1[26]);
1148 output[6] = WRAPLOW(step1[6] + step1[25]);
1149 output[7] = WRAPLOW(step1[7] + step1[24]);
1150 output[8] = WRAPLOW(step1[8] + step1[23]);
1151 output[9] = WRAPLOW(step1[9] + step1[22]);
1152 output[10] = WRAPLOW(step1[10] + step1[21]);
1153 output[11] = WRAPLOW(step1[11] + step1[20]);
1154 output[12] = WRAPLOW(step1[12] + step1[19]);
1155 output[13] = WRAPLOW(step1[13] + step1[18]);
1156 output[14] = WRAPLOW(step1[14] + step1[17]);
1157 output[15] = WRAPLOW(step1[15] + step1[16]);
1158 output[16] = WRAPLOW(step1[15] - step1[16]);
1159 output[17] = WRAPLOW(step1[14] - step1[17]);
1160 output[18] = WRAPLOW(step1[13] - step1[18]);
1161 output[19] = WRAPLOW(step1[12] - step1[19]);
1162 output[20] = WRAPLOW(step1[11] - step1[20]);
1163 output[21] = WRAPLOW(step1[10] - step1[21]);
1164 output[22] = WRAPLOW(step1[9] - step1[22]);
1165 output[23] = WRAPLOW(step1[8] - step1[23]);
1166 output[24] = WRAPLOW(step1[7] - step1[24]);
1167 output[25] = WRAPLOW(step1[6] - step1[25]);
1168 output[26] = WRAPLOW(step1[5] - step1[26]);
1169 output[27] = WRAPLOW(step1[4] - step1[27]);
1170 output[28] = WRAPLOW(step1[3] - step1[28]);
1171 output[29] = WRAPLOW(step1[2] - step1[29]);
1172 output[30] = WRAPLOW(step1[1] - step1[30]);
1173 output[31] = WRAPLOW(step1[0] - step1[31]);
1176 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
1179 tran_low_t out[32 * 32];
1180 tran_low_t *outptr = out;
1181 tran_low_t temp_in[32], temp_out[32];
1184 for (i = 0; i < 32; ++i) {
1185 int16_t zero_coeff[16];
1186 for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
1187 for (j = 0; j < 8; ++j)
1188 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1189 for (j = 0; j < 4; ++j)
1190 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1191 for (j = 0; j < 2; ++j)
1192 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
1194 if (zero_coeff[0] | zero_coeff[1])
1195 idct32_c(input, outptr);
1197 memset(outptr, 0, sizeof(tran_low_t) * 32);
1203 for (i = 0; i < 32; ++i) {
1204 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1205 idct32_c(temp_in, temp_out);
1206 for (j = 0; j < 32; ++j) {
1207 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1208 ROUND_POWER_OF_TWO(temp_out[j], 6));
1213 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
1216 tran_low_t out[32 * 32] = { 0 };
1217 tran_low_t *outptr = out;
1218 tran_low_t temp_in[32], temp_out[32];
1221 // Only upper-left 16x16 has non-zero coeff
1222 for (i = 0; i < 16; ++i) {
1223 idct32_c(input, outptr);
1229 for (i = 0; i < 32; ++i) {
1230 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1231 idct32_c(temp_in, temp_out);
1232 for (j = 0; j < 32; ++j) {
1233 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1234 ROUND_POWER_OF_TWO(temp_out[j], 6));
1239 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
1242 tran_low_t out[32 * 32] = { 0 };
1243 tran_low_t *outptr = out;
1244 tran_low_t temp_in[32], temp_out[32];
1247 // Only upper-left 8x8 has non-zero coeff
1248 for (i = 0; i < 8; ++i) {
1249 idct32_c(input, outptr);
1255 for (i = 0; i < 32; ++i) {
1256 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
1257 idct32_c(temp_in, temp_out);
1258 for (j = 0; j < 32; ++j) {
1259 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
1260 ROUND_POWER_OF_TWO(temp_out[j], 6));
1265 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
1268 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
1270 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
1271 a1 = ROUND_POWER_OF_TWO(out, 6);
1273 for (j = 0; j < 32; ++j) {
1274 for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
1279 #if CONFIG_VP9_HIGHBITDEPTH
1281 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
1282 // transform amplify bits + 1 bit for contingency in rounding and quantizing
1283 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25)
1285 static INLINE int detect_invalid_highbd_input(const tran_low_t *input,
1288 for (i = 0; i < size; ++i)
1289 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1;
1293 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1294 int stride, int bd) {
1295 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1296 0.5 shifts per pixel. */
1298 tran_low_t output[16];
1299 tran_high_t a1, b1, c1, d1, e1;
1300 const tran_low_t *ip = input;
1301 tran_low_t *op = output;
1302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1304 for (i = 0; i < 4; i++) {
1305 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1306 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1307 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1308 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1311 e1 = (a1 - d1) >> 1;
1316 op[0] = HIGHBD_WRAPLOW(a1, bd);
1317 op[1] = HIGHBD_WRAPLOW(b1, bd);
1318 op[2] = HIGHBD_WRAPLOW(c1, bd);
1319 op[3] = HIGHBD_WRAPLOW(d1, bd);
1325 for (i = 0; i < 4; i++) {
1332 e1 = (a1 - d1) >> 1;
1338 highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1340 highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1342 highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1344 highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1351 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
1352 int stride, int bd) {
1356 const tran_low_t *ip = in;
1357 tran_low_t *op = tmp;
1358 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1361 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1364 op[0] = HIGHBD_WRAPLOW(a1, bd);
1365 op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1368 for (i = 0; i < 4; i++) {
1371 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
1372 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
1373 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
1374 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
1380 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1381 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1382 tran_low_t x0 = input[0];
1383 tran_low_t x1 = input[1];
1384 tran_low_t x2 = input[2];
1385 tran_low_t x3 = input[3];
1388 if (detect_invalid_highbd_input(input, 4)) {
1389 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1390 assert(0 && "invalid highbd txfm input");
1391 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1392 memset(output, 0, sizeof(*output) * 4);
1396 if (!(x0 | x1 | x2 | x3)) {
1397 memset(output, 0, 4 * sizeof(*output));
1401 s0 = sinpi_1_9 * x0;
1402 s1 = sinpi_2_9 * x0;
1403 s2 = sinpi_3_9 * x1;
1404 s3 = sinpi_4_9 * x2;
1405 s4 = sinpi_1_9 * x2;
1406 s5 = sinpi_2_9 * x3;
1407 s6 = sinpi_4_9 * x3;
1408 s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
1413 s2 = sinpi_3_9 * s7;
1415 // 1-D transform scaling factor is sqrt(2).
1416 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1417 // + 1b (addition) = 29b.
1418 // Hence the output bit depth is 15b.
1419 output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
1420 output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
1421 output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1422 output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
1425 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
1427 tran_high_t temp1, temp2;
1430 if (detect_invalid_highbd_input(input, 4)) {
1431 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1432 assert(0 && "invalid highbd txfm input");
1433 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1434 memset(output, 0, sizeof(*output) * 4);
1439 temp1 = (input[0] + input[2]) * cospi_16_64;
1440 temp2 = (input[0] - input[2]) * cospi_16_64;
1441 step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1442 step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1443 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1444 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
1445 step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1446 step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1449 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
1450 output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
1451 output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
1452 output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
1455 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
1456 int stride, int bd) {
1458 tran_low_t out[4 * 4];
1459 tran_low_t *outptr = out;
1460 tran_low_t temp_in[4], temp_out[4];
1461 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1464 for (i = 0; i < 4; ++i) {
1465 vpx_highbd_idct4_c(input, outptr, bd);
1471 for (i = 0; i < 4; ++i) {
1472 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
1473 vpx_highbd_idct4_c(temp_in, temp_out, bd);
1474 for (j = 0; j < 4; ++j) {
1475 dest[j * stride + i] = highbd_clip_pixel_add(
1476 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1481 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
1482 int stride, int bd) {
1486 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1487 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1489 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1490 a1 = ROUND_POWER_OF_TWO(out, 4);
1492 for (i = 0; i < 4; i++) {
1493 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1494 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1495 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1496 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1501 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1502 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1503 tran_low_t x0 = input[7];
1504 tran_low_t x1 = input[0];
1505 tran_low_t x2 = input[5];
1506 tran_low_t x3 = input[2];
1507 tran_low_t x4 = input[3];
1508 tran_low_t x5 = input[4];
1509 tran_low_t x6 = input[1];
1510 tran_low_t x7 = input[6];
1513 if (detect_invalid_highbd_input(input, 8)) {
1514 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1515 assert(0 && "invalid highbd txfm input");
1516 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1517 memset(output, 0, sizeof(*output) * 8);
1521 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1522 memset(output, 0, 8 * sizeof(*output));
1527 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1528 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1529 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1530 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1531 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1532 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1533 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1534 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1536 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1537 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1538 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1539 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1540 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1541 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1542 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1543 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
1550 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1551 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1552 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1553 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1555 x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1556 x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1557 x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1558 x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1559 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1560 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1561 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1562 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1565 s2 = cospi_16_64 * (x2 + x3);
1566 s3 = cospi_16_64 * (x2 - x3);
1567 s6 = cospi_16_64 * (x6 + x7);
1568 s7 = cospi_16_64 * (x6 - x7);
1570 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1571 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1572 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1573 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1575 output[0] = HIGHBD_WRAPLOW(x0, bd);
1576 output[1] = HIGHBD_WRAPLOW(-x4, bd);
1577 output[2] = HIGHBD_WRAPLOW(x6, bd);
1578 output[3] = HIGHBD_WRAPLOW(-x2, bd);
1579 output[4] = HIGHBD_WRAPLOW(x3, bd);
1580 output[5] = HIGHBD_WRAPLOW(-x7, bd);
1581 output[6] = HIGHBD_WRAPLOW(x5, bd);
1582 output[7] = HIGHBD_WRAPLOW(-x1, bd);
1585 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
1586 tran_low_t step1[8], step2[8];
1587 tran_high_t temp1, temp2;
1589 if (detect_invalid_highbd_input(input, 8)) {
1590 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1591 assert(0 && "invalid highbd txfm input");
1592 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1593 memset(output, 0, sizeof(*output) * 8);
1598 step1[0] = input[0];
1599 step1[2] = input[4];
1600 step1[1] = input[2];
1601 step1[3] = input[6];
1602 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1603 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
1604 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1605 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1606 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1607 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
1608 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1609 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1611 // stage 2 & stage 3 - even half
1612 vpx_highbd_idct4_c(step1, step1, bd);
1614 // stage 2 - odd half
1615 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1616 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1617 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1618 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1620 // stage 3 - odd half
1621 step1[4] = step2[4];
1622 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1623 temp2 = (step2[5] + step2[6]) * cospi_16_64;
1624 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1625 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1626 step1[7] = step2[7];
1629 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1630 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1631 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1632 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1633 output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1634 output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1635 output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1636 output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1639 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
1640 int stride, int bd) {
1642 tran_low_t out[8 * 8];
1643 tran_low_t *outptr = out;
1644 tran_low_t temp_in[8], temp_out[8];
1645 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1647 // First transform rows
1648 for (i = 0; i < 8; ++i) {
1649 vpx_highbd_idct8_c(input, outptr, bd);
1654 // Then transform columns
1655 for (i = 0; i < 8; ++i) {
1656 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1657 vpx_highbd_idct8_c(temp_in, temp_out, bd);
1658 for (j = 0; j < 8; ++j) {
1659 dest[j * stride + i] = highbd_clip_pixel_add(
1660 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1665 void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
1666 int stride, int bd) {
1668 tran_low_t out[8 * 8] = { 0 };
1669 tran_low_t *outptr = out;
1670 tran_low_t temp_in[8], temp_out[8];
1671 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1673 // First transform rows
1674 // Only first 4 row has non-zero coefs
1675 for (i = 0; i < 4; ++i) {
1676 vpx_highbd_idct8_c(input, outptr, bd);
1681 // Then transform columns
1682 for (i = 0; i < 8; ++i) {
1683 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
1684 vpx_highbd_idct8_c(temp_in, temp_out, bd);
1685 for (j = 0; j < 8; ++j) {
1686 dest[j * stride + i] = highbd_clip_pixel_add(
1687 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1692 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
1693 int stride, int bd) {
1697 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
1698 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1700 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
1701 a1 = ROUND_POWER_OF_TWO(out, 5);
1702 for (j = 0; j < 8; ++j) {
1703 for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1708 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1709 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
1710 tran_high_t s9, s10, s11, s12, s13, s14, s15;
1711 tran_low_t x0 = input[15];
1712 tran_low_t x1 = input[0];
1713 tran_low_t x2 = input[13];
1714 tran_low_t x3 = input[2];
1715 tran_low_t x4 = input[11];
1716 tran_low_t x5 = input[4];
1717 tran_low_t x6 = input[9];
1718 tran_low_t x7 = input[6];
1719 tran_low_t x8 = input[7];
1720 tran_low_t x9 = input[8];
1721 tran_low_t x10 = input[5];
1722 tran_low_t x11 = input[10];
1723 tran_low_t x12 = input[3];
1724 tran_low_t x13 = input[12];
1725 tran_low_t x14 = input[1];
1726 tran_low_t x15 = input[14];
1729 if (detect_invalid_highbd_input(input, 16)) {
1730 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1731 assert(0 && "invalid highbd txfm input");
1732 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1733 memset(output, 0, sizeof(*output) * 16);
1737 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1739 memset(output, 0, 16 * sizeof(*output));
1744 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1745 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1746 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1747 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1748 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1749 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1750 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1751 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1752 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1753 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1754 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1755 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1756 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1757 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1758 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1759 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1761 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1762 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1763 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1764 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1765 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1766 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1767 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1768 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1769 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1770 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1771 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1772 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1773 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1774 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1775 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1776 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
1787 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1788 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1789 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1790 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1791 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1792 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1793 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1794 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1796 x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1797 x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1798 x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1799 x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1800 x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1801 x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1802 x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1803 x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
1804 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1805 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1806 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1807 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1808 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1809 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1810 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1811 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
1818 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1819 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1820 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1821 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1826 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1827 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1828 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1829 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1831 x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1832 x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1833 x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1834 x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
1835 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1836 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1837 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1838 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
1839 x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
1840 x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
1841 x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
1842 x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
1843 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
1844 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
1845 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
1846 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
1849 s2 = (-cospi_16_64) * (x2 + x3);
1850 s3 = cospi_16_64 * (x2 - x3);
1851 s6 = cospi_16_64 * (x6 + x7);
1852 s7 = cospi_16_64 * (-x6 + x7);
1853 s10 = cospi_16_64 * (x10 + x11);
1854 s11 = cospi_16_64 * (-x10 + x11);
1855 s14 = (-cospi_16_64) * (x14 + x15);
1856 s15 = cospi_16_64 * (x14 - x15);
1858 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1859 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1860 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1861 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1862 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
1863 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
1864 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
1865 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
1867 output[0] = HIGHBD_WRAPLOW(x0, bd);
1868 output[1] = HIGHBD_WRAPLOW(-x8, bd);
1869 output[2] = HIGHBD_WRAPLOW(x12, bd);
1870 output[3] = HIGHBD_WRAPLOW(-x4, bd);
1871 output[4] = HIGHBD_WRAPLOW(x6, bd);
1872 output[5] = HIGHBD_WRAPLOW(x14, bd);
1873 output[6] = HIGHBD_WRAPLOW(x10, bd);
1874 output[7] = HIGHBD_WRAPLOW(x2, bd);
1875 output[8] = HIGHBD_WRAPLOW(x3, bd);
1876 output[9] = HIGHBD_WRAPLOW(x11, bd);
1877 output[10] = HIGHBD_WRAPLOW(x15, bd);
1878 output[11] = HIGHBD_WRAPLOW(x7, bd);
1879 output[12] = HIGHBD_WRAPLOW(x5, bd);
1880 output[13] = HIGHBD_WRAPLOW(-x13, bd);
1881 output[14] = HIGHBD_WRAPLOW(x9, bd);
1882 output[15] = HIGHBD_WRAPLOW(-x1, bd);
1885 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
1886 tran_low_t step1[16], step2[16];
1887 tran_high_t temp1, temp2;
1890 if (detect_invalid_highbd_input(input, 16)) {
1891 #if CONFIG_COEFFICIENT_RANGE_CHECKING
1892 assert(0 && "invalid highbd txfm input");
1893 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
1894 memset(output, 0, sizeof(*output) * 16);
1899 step1[0] = input[0 / 2];
1900 step1[1] = input[16 / 2];
1901 step1[2] = input[8 / 2];
1902 step1[3] = input[24 / 2];
1903 step1[4] = input[4 / 2];
1904 step1[5] = input[20 / 2];
1905 step1[6] = input[12 / 2];
1906 step1[7] = input[28 / 2];
1907 step1[8] = input[2 / 2];
1908 step1[9] = input[18 / 2];
1909 step1[10] = input[10 / 2];
1910 step1[11] = input[26 / 2];
1911 step1[12] = input[6 / 2];
1912 step1[13] = input[22 / 2];
1913 step1[14] = input[14 / 2];
1914 step1[15] = input[30 / 2];
1917 step2[0] = step1[0];
1918 step2[1] = step1[1];
1919 step2[2] = step1[2];
1920 step2[3] = step1[3];
1921 step2[4] = step1[4];
1922 step2[5] = step1[5];
1923 step2[6] = step1[6];
1924 step2[7] = step1[7];
1926 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1927 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
1928 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1929 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1931 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1932 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
1933 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1934 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1936 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1937 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
1938 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1939 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1941 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1942 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
1943 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1944 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1947 step1[0] = step2[0];
1948 step1[1] = step2[1];
1949 step1[2] = step2[2];
1950 step1[3] = step2[3];
1952 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1953 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
1954 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1955 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1956 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1957 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
1958 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1959 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1961 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1962 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1963 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1964 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1965 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1966 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1967 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1968 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1971 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1972 temp2 = (step1[0] - step1[1]) * cospi_16_64;
1973 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1974 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1975 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1976 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
1977 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1978 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1979 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1980 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1981 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1982 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1984 step2[8] = step1[8];
1985 step2[15] = step1[15];
1986 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1987 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
1988 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1989 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1990 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1991 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
1992 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1993 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
1994 step2[11] = step1[11];
1995 step2[12] = step1[12];
1998 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
1999 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2000 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2001 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2002 step1[4] = step2[4];
2003 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2004 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2005 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2006 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2007 step1[7] = step2[7];
2009 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2010 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2011 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2012 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2013 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2014 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2015 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2016 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2019 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2020 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2021 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2022 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2023 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2024 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2025 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2026 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2027 step2[8] = step1[8];
2028 step2[9] = step1[9];
2029 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2030 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2031 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2032 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2033 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2034 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2035 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2036 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2037 step2[14] = step1[14];
2038 step2[15] = step1[15];
2041 output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2042 output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2043 output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2044 output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2045 output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2046 output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2047 output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2048 output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2049 output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2050 output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2051 output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2052 output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2053 output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2054 output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2055 output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2056 output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2059 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
2060 int stride, int bd) {
2062 tran_low_t out[16 * 16];
2063 tran_low_t *outptr = out;
2064 tran_low_t temp_in[16], temp_out[16];
2065 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2067 // First transform rows
2068 for (i = 0; i < 16; ++i) {
2069 vpx_highbd_idct16_c(input, outptr, bd);
2074 // Then transform columns
2075 for (i = 0; i < 16; ++i) {
2076 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2077 vpx_highbd_idct16_c(temp_in, temp_out, bd);
2078 for (j = 0; j < 16; ++j) {
2079 dest[j * stride + i] = highbd_clip_pixel_add(
2080 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2085 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
2086 int stride, int bd) {
2088 tran_low_t out[16 * 16] = { 0 };
2089 tran_low_t *outptr = out;
2090 tran_low_t temp_in[16], temp_out[16];
2091 uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
2093 // First transform rows. Since all non-zero dct coefficients are in
2094 // upper-left 8x8 area, we only need to calculate first 8 rows here.
2095 for (i = 0; i < 8; ++i) {
2096 vpx_highbd_idct16_c(input, outptr, bd);
2101 // Then transform columns
2102 for (i = 0; i < 16; ++i) {
2103 uint16_t *destT = dest;
2104 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2105 vpx_highbd_idct16_c(temp_in, temp_out, bd);
2106 for (j = 0; j < 16; ++j) {
2107 destT[i] = highbd_clip_pixel_add(destT[i],
2108 ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2114 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
2115 int stride, int bd) {
2117 tran_low_t out[16 * 16] = { 0 };
2118 tran_low_t *outptr = out;
2119 tran_low_t temp_in[16], temp_out[16];
2120 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2122 // First transform rows. Since all non-zero dct coefficients are in
2123 // upper-left 4x4 area, we only need to calculate first 4 rows here.
2124 for (i = 0; i < 4; ++i) {
2125 vpx_highbd_idct16_c(input, outptr, bd);
2130 // Then transform columns
2131 for (i = 0; i < 16; ++i) {
2132 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2133 vpx_highbd_idct16_c(temp_in, temp_out, bd);
2134 for (j = 0; j < 16; ++j) {
2135 dest[j * stride + i] = highbd_clip_pixel_add(
2136 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2141 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
2142 int stride, int bd) {
2146 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2147 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2149 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2150 a1 = ROUND_POWER_OF_TWO(out, 6);
2151 for (j = 0; j < 16; ++j) {
2152 for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2157 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
2159 tran_low_t step1[32], step2[32];
2160 tran_high_t temp1, temp2;
2163 if (detect_invalid_highbd_input(input, 32)) {
2164 #if CONFIG_COEFFICIENT_RANGE_CHECKING
2165 assert(0 && "invalid highbd txfm input");
2166 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING
2167 memset(output, 0, sizeof(*output) * 32);
2172 step1[0] = input[0];
2173 step1[1] = input[16];
2174 step1[2] = input[8];
2175 step1[3] = input[24];
2176 step1[4] = input[4];
2177 step1[5] = input[20];
2178 step1[6] = input[12];
2179 step1[7] = input[28];
2180 step1[8] = input[2];
2181 step1[9] = input[18];
2182 step1[10] = input[10];
2183 step1[11] = input[26];
2184 step1[12] = input[6];
2185 step1[13] = input[22];
2186 step1[14] = input[14];
2187 step1[15] = input[30];
2189 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2190 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
2191 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2192 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2194 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2195 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
2196 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2197 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2199 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2200 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
2201 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2202 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2204 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2205 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
2206 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2207 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2209 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2210 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
2211 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2212 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2214 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2215 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
2216 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2217 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2219 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2220 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
2221 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2222 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2224 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2225 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
2226 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2227 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2230 step2[0] = step1[0];
2231 step2[1] = step1[1];
2232 step2[2] = step1[2];
2233 step2[3] = step1[3];
2234 step2[4] = step1[4];
2235 step2[5] = step1[5];
2236 step2[6] = step1[6];
2237 step2[7] = step1[7];
2239 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2240 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
2241 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2242 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2244 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2245 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
2246 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2247 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2249 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2250 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
2251 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2252 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2254 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2255 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
2256 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2257 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2259 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2260 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2261 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2262 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2263 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2264 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2265 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2266 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2267 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2268 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2269 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2270 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2271 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2272 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2273 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2274 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2277 step1[0] = step2[0];
2278 step1[1] = step2[1];
2279 step1[2] = step2[2];
2280 step1[3] = step2[3];
2282 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2283 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
2284 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2285 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2286 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2287 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
2288 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2289 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2291 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2292 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2293 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2294 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2295 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2296 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2297 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2298 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2300 step1[16] = step2[16];
2301 step1[31] = step2[31];
2302 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2303 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
2304 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2305 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2306 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2307 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
2308 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2309 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2310 step1[19] = step2[19];
2311 step1[20] = step2[20];
2312 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2313 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
2314 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2315 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2316 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2317 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
2318 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2319 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2320 step1[23] = step2[23];
2321 step1[24] = step2[24];
2322 step1[27] = step2[27];
2323 step1[28] = step2[28];
2326 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2327 temp2 = (step1[0] - step1[1]) * cospi_16_64;
2328 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2329 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2330 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2331 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
2332 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2333 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2334 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2335 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2336 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2337 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2339 step2[8] = step1[8];
2340 step2[15] = step1[15];
2341 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2342 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
2343 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2344 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2345 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2346 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
2347 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2348 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2349 step2[11] = step1[11];
2350 step2[12] = step1[12];
2352 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2353 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2354 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2355 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2356 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2357 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2358 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2359 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2361 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2362 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2363 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2364 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2365 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2366 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2367 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2368 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2371 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2372 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2373 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2374 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2375 step1[4] = step2[4];
2376 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2377 temp2 = (step2[5] + step2[6]) * cospi_16_64;
2378 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2379 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2380 step1[7] = step2[7];
2382 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2383 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2384 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2385 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2386 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2387 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2388 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2389 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2391 step1[16] = step2[16];
2392 step1[17] = step2[17];
2393 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2394 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
2395 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2396 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2397 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2398 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
2399 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2400 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2401 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2402 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
2403 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2404 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2405 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2406 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
2407 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2408 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2409 step1[22] = step2[22];
2410 step1[23] = step2[23];
2411 step1[24] = step2[24];
2412 step1[25] = step2[25];
2413 step1[30] = step2[30];
2414 step1[31] = step2[31];
2417 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2418 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2419 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2420 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2421 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2422 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2423 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2424 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2425 step2[8] = step1[8];
2426 step2[9] = step1[9];
2427 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2428 temp2 = (step1[10] + step1[13]) * cospi_16_64;
2429 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2430 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2431 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2432 temp2 = (step1[11] + step1[12]) * cospi_16_64;
2433 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2434 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2435 step2[14] = step1[14];
2436 step2[15] = step1[15];
2438 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2439 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2440 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2441 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2442 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2443 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2444 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2445 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2447 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2448 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2449 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2450 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2451 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2452 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2453 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2454 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2457 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2458 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2459 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2460 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2461 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2462 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2463 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2464 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2465 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2466 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2467 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2468 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2469 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2470 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2471 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2472 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2474 step1[16] = step2[16];
2475 step1[17] = step2[17];
2476 step1[18] = step2[18];
2477 step1[19] = step2[19];
2478 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2479 temp2 = (step2[20] + step2[27]) * cospi_16_64;
2480 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2481 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2482 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2483 temp2 = (step2[21] + step2[26]) * cospi_16_64;
2484 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2485 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2486 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2487 temp2 = (step2[22] + step2[25]) * cospi_16_64;
2488 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2489 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2490 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2491 temp2 = (step2[23] + step2[24]) * cospi_16_64;
2492 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2493 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
2494 step1[28] = step2[28];
2495 step1[29] = step2[29];
2496 step1[30] = step2[30];
2497 step1[31] = step2[31];
2500 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2501 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2502 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2503 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2504 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2505 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2506 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2507 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2508 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2509 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2510 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2511 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2512 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2513 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2514 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2515 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2516 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2517 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2518 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2519 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2520 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2521 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2522 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2523 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2524 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2525 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2526 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2527 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2528 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2529 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2530 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2531 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2534 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
2535 int stride, int bd) {
2537 tran_low_t out[32 * 32];
2538 tran_low_t *outptr = out;
2539 tran_low_t temp_in[32], temp_out[32];
2540 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2543 for (i = 0; i < 32; ++i) {
2544 tran_low_t zero_coeff[16];
2545 for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2546 for (j = 0; j < 8; ++j)
2547 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2548 for (j = 0; j < 4; ++j)
2549 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2550 for (j = 0; j < 2; ++j)
2551 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2553 if (zero_coeff[0] | zero_coeff[1])
2554 highbd_idct32_c(input, outptr, bd);
2556 memset(outptr, 0, sizeof(tran_low_t) * 32);
2562 for (i = 0; i < 32; ++i) {
2563 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2564 highbd_idct32_c(temp_in, temp_out, bd);
2565 for (j = 0; j < 32; ++j) {
2566 dest[j * stride + i] = highbd_clip_pixel_add(
2567 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2572 void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest8,
2573 int stride, int bd) {
2575 tran_low_t out[32 * 32] = { 0 };
2576 tran_low_t *outptr = out;
2577 tran_low_t temp_in[32], temp_out[32];
2578 uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
2581 // Only upper-left 16x16 has non-zero coeff
2582 for (i = 0; i < 16; ++i) {
2583 highbd_idct32_c(input, outptr, bd);
2589 for (i = 0; i < 32; ++i) {
2590 uint16_t *destT = dest;
2591 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2592 highbd_idct32_c(temp_in, temp_out, bd);
2593 for (j = 0; j < 32; ++j) {
2594 destT[i] = highbd_clip_pixel_add(destT[i],
2595 ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2601 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
2602 int stride, int bd) {
2604 tran_low_t out[32 * 32] = { 0 };
2605 tran_low_t *outptr = out;
2606 tran_low_t temp_in[32], temp_out[32];
2607 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2610 // Only upper-left 8x8 has non-zero coeff
2611 for (i = 0; i < 8; ++i) {
2612 highbd_idct32_c(input, outptr, bd);
2618 for (i = 0; i < 32; ++i) {
2619 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
2620 highbd_idct32_c(temp_in, temp_out, bd);
2621 for (j = 0; j < 32; ++j) {
2622 dest[j * stride + i] = highbd_clip_pixel_add(
2623 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2628 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
2629 int stride, int bd) {
2632 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2634 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2636 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
2637 a1 = ROUND_POWER_OF_TWO(out, 6);
2639 for (j = 0; j < 32; ++j) {
2640 for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2645 #endif // CONFIG_VP9_HIGHBITDEPTH