2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "third_party/x86inc/x86inc.asm"
13 ; This file provides the SSSE3 version of the inverse transform. Some
14 ; of the functions were originally derived from the ffmpeg project.
15 ; Note that the current version applies to x86 64-bit only.
; Broadcast word constants: every label below is 8 copies of one 16-bit value
; (one full XMM register). The "x2" tables hold TWICE the value in the label
; name (pw_11585x2 is 23170 = 2 * 11585); an "m" prefix marks a negated value
; and the extra underscores only pad shorter label names.
; In this file they are used as pmulhrsw multipliers. pmulhrsw computes
; (a * b + 0x4000) >> 15 per lane, so multiplying by 2*c gives
; (a * c + 8192) >> 14.
19 pw_11585x2: times 8 dw 23170
21 pw_m2404x2: times 8 dw -2404*2
22 pw_m4756x2: times 8 dw -4756*2
23 pw_m5520x2: times 8 dw -5520*2
24 pw_m8423x2: times 8 dw -8423*2
25 pw_m9102x2: times 8 dw -9102*2
26 pw_m10394x2: times 8 dw -10394*2
27 pw_m11003x2: times 8 dw -11003*2
29 pw_16364x2: times 8 dw 16364*2
30 pw_16305x2: times 8 dw 16305*2
31 pw_16207x2: times 8 dw 16207*2
32 pw_16069x2: times 8 dw 16069*2
33 pw_15893x2: times 8 dw 15893*2
34 pw_15679x2: times 8 dw 15679*2
35 pw_15426x2: times 8 dw 15426*2
36 pw_15137x2: times 8 dw 15137*2
37 pw_14811x2: times 8 dw 14811*2
38 pw_14449x2: times 8 dw 14449*2
39 pw_14053x2: times 8 dw 14053*2
40 pw_13623x2: times 8 dw 13623*2
41 pw_13160x2: times 8 dw 13160*2
42 pw_12665x2: times 8 dw 12665*2
43 pw_12140x2: times 8 dw 12140*2
44 pw__9760x2: times 8 dw 9760*2
45 pw__7723x2: times 8 dw 7723*2
46 pw__7005x2: times 8 dw 7005*2
47 pw__6270x2: times 8 dw 6270*2
48 pw__3981x2: times 8 dw 3981*2
49 pw__3196x2: times 8 dw 3196*2
50 pw__1606x2: times 8 dw 1606*2
51 pw___804x2: times 8 dw 804*2
; Four dwords of 8192 (1 << 13).
; NOTE(review): presumably the rounding term held in m8 for the butterfly
; macros -- the load into m8 is not visible in this section; confirm.
53 pd_8192: times 4 dd 8192
; TRANSFORM_COEFFS a, b
; Emits three 8-word tables of interleaved coefficient pairs:
;   pw_<a>_<b>    = ( a,  b) repeated 4 times
;   pw_m<b>_<a>   = (-b,  a) repeated 4 times
;   pw_m<a>_m<b>  = (-a, -b) repeated 4 times
; BUTTERFLY_4X and BUTTERFLY_4Xmm address these tables by label name, so every
; (coef1, coef2) pair passed to them needs a matching instantiation below.
57 %macro TRANSFORM_COEFFS 2
58 pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
59 pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
60 pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
; pairs referenced by the butterflies at the top of this file and by the 32x32
; macros
63 TRANSFORM_COEFFS 6270, 15137
64 TRANSFORM_COEFFS 3196, 16069
65 TRANSFORM_COEFFS 13623, 9102
67 ; constants for 32x32_34
68 TRANSFORM_COEFFS 804, 16364
69 TRANSFORM_COEFFS 15426, 5520
70 TRANSFORM_COEFFS 3981, 15893
71 TRANSFORM_COEFFS 16207, 2404
72 TRANSFORM_COEFFS 1606, 16305
73 TRANSFORM_COEFFS 15679, 4756
74 TRANSFORM_COEFFS 11585, 11585
76 ; constants for 32x32_1024
77 TRANSFORM_COEFFS 12140, 11003
78 TRANSFORM_COEFFS 7005, 14811
79 TRANSFORM_COEFFS 14053, 8423
80 TRANSFORM_COEFFS 9760, 13160
81 TRANSFORM_COEFFS 12665, 10394
82 TRANSFORM_COEFFS 7723, 14449
; PAIR_xx_COEFFS a, b
; Each macro emits one 8-word table whose low four words carry the first
; value and whose high four words carry the second, so a single pmulhrsw
; scales the two 64-bit halves of a register by different constants.
;   PAIR_PP: dpw_<a>_<b>    = ( a x4,  b x4)
;   PAIR_MP: dpw_m<a>_<b>   = (-a x4,  b x4)
;   PAIR_MM: dpw_m<a>_m<b>  = (-a x4, -b x4)
84 %macro PAIR_PP_COEFFS 2
85 dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
88 %macro PAIR_MP_COEFFS 2
89 dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
92 %macro PAIR_MM_COEFFS 2
93 dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
; The values below are the doubled forms of the transform constants used
; elsewhere in this file: 30274 = 2*15137, 12540 = 2*6270, 6392 = 2*3196,
; 32138 = 2*16069, 18204 = 2*9102, 27246 = 2*13623.
; These tables are consumed by idct8x8_12_add.
96 PAIR_PP_COEFFS 30274, 12540
97 PAIR_PP_COEFFS 6392, 32138
98 PAIR_MP_COEFFS 18204, 27246
100 PAIR_PP_COEFFS 12540, 12540
101 PAIR_PP_COEFFS 30274, 30274
102 PAIR_PP_COEFFS 6392, 6392
103 PAIR_PP_COEFFS 32138, 32138
104 PAIR_MM_COEFFS 18204, 18204
105 PAIR_PP_COEFFS 27246, 27246
116 ; butterfly operation
; MUL_ADD_2X dst1, dst2, src, round, coefs1, coefs2
; Callers in this file pass register numbers for dst1/dst2/src, a register
; name for round, and bracketed memory operands for coefs1/coefs2.
; NOTE(review): the macro body is not visible in this section, so the exact
; instruction sequence is not documented here.
117 %macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
; BUTTERFLY_4X dst1, dst2, coef1, coef2, round, tmp1, tmp2
;   dst1, dst2, tmp1, tmp2 : register NUMBERS (expanded as m%N)
;   coef1, coef2           : integers naming the tables pw_m<coef2>_<coef1> and
;                            pw_<coef1>_<coef2>; the pair must have been
;                            instantiated with TRANSFORM_COEFFS coef1, coef2
;   round                  : forwarded unchanged to MUL_ADD_2X (callers pass m8)
; Clobbers tmp1 and tmp2.
126 %macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
127 punpckhwd m%6, m%2, m%1 ; tmp1 = high words of dst2 and dst1, interleaved
128 MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] ; high half: results into tmp2 / tmp1
; NOTE(review): the step that prepares m%2 for the second MUL_ADD_2X, and any
; final packing of the results, are not visible in this section.
130 MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] ; results into dst1 / dst2
; BUTTERFLY_4Xmm dst1, dst2, coef1, coef2, round, tmp1, tmp2
; Same argument convention and visible structure as BUTTERFLY_4X; the only
; difference is that the second coefficient table is pw_m<coef1>_m<coef2>
; (both values negated) instead of pw_<coef1>_<coef2>.
; Clobbers tmp1 and tmp2.
135 %macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
136 punpckhwd m%6, m%2, m%1 ; tmp1 = high words of dst2 and dst1, interleaved
137 MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] ; high half: results into tmp2 / tmp1
; NOTE(review): the lines between the two MUL_ADD_2X invocations and after the
; second one are not visible in this section.
139 MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] ; results into dst1 / dst2
; INTERLEAVE_2X suffix, a, b, tmp
;   suffix : unpack granularity appended to "punpckh" (wd, dq or qdq)
;   a, b   : register numbers of the two sources
;   tmp    : register number that receives the high-half interleave
; NOTE(review): only the high-half unpack is visible in this section; the
; remaining lines of the macro are not shown.
145 %macro INTERLEAVE_2X 4
146 punpckh%1 m%4, m%2, m%3 ; tmp = interleave of the high halves of a and b
; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
; Takes eight register numbers plus one scratch register number and applies
; three rounds of INTERLEAVE_2X at increasing granularity (word, dword, qword).
; The scratch register is clobbered.
151 %macro TRANSPOSE8X8 9
; round 1: 16-bit interleave of adjacent register pairs
152 INTERLEAVE_2X wd, %1, %2, %9
153 INTERLEAVE_2X wd, %3, %4, %9
154 INTERLEAVE_2X wd, %5, %6, %9
155 INTERLEAVE_2X wd, %7, %8, %9
; round 2: 32-bit interleave
157 INTERLEAVE_2X dq, %1, %3, %9
158 INTERLEAVE_2X dq, %2, %4, %9
159 INTERLEAVE_2X dq, %5, %7, %9
160 INTERLEAVE_2X dq, %6, %8, %9
; round 3: 64-bit interleave
162 INTERLEAVE_2X qdq, %1, %5, %9
163 INTERLEAVE_2X qdq, %3, %7, %9
164 INTERLEAVE_2X qdq, %2, %6, %9
165 INTERLEAVE_2X qdq, %4, %8, %9
; NOTE(review): these three butterflies belong to a macro whose header is not
; visible in this section. Each one uses m8 as the rounding register and
; m9/m10 as scratch, and relies on the matching TRANSFORM_COEFFS tables.
173 BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 ; operates on m2 / m6
176 BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 ; operates on m1 / m7
177 BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 ; operates on m5 / m3
196 ; This macro handles 8 pixels per line
; ADD_STORE_8P_2X src1, src2, tmp1, tmp2, zero
; All five arguments are register numbers. The visible lines read 8 bytes
; from the second output line (outputq + strideq) into tmp2 and write 8 bytes
; back to the same address.
; NOTE(review): the handling of the first line and the add/pack arithmetic
; are not visible in this section.
197 %macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
204 movh m%4, [outputq + strideq] ; tmp2 = 8 bytes of the second line
212 movh [outputq + strideq], m%4 ; write the second line back
216 ; full inverse 8x8 2D-DCT transform
; cglobal declares 3 arguments (input, output, stride), 5 general-purpose
; registers and 13 XMM registers.
; Visible flow: load eight 16-byte rows of coefficients into m0..m7, transpose
; twice, then add the result to the destination two lines at a time.
; NOTE(review): the 1-D transform steps that presumably sit around the two
; transposes are not visible in this section.
217 cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
220 mova m12, [pw_11585x2]
222 lea r3, [2 * strideq] ; r3 = byte distance covering two output lines
224 mova m0, [inputq + 0]
225 mova m1, [inputq + 16]
226 mova m2, [inputq + 32]
227 mova m3, [inputq + 48]
228 mova m4, [inputq + 64]
229 mova m5, [inputq + 80]
230 mova m6, [inputq + 96]
231 mova m7, [inputq + 112]
233 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
235 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
; NOTE(review): m12 is passed in the "zero" slot of ADD_STORE_8P_2X below but
; was loaded with pw_11585x2 above; presumably it is cleared in a line not
; visible here -- confirm.
239 ADD_STORE_8P_2X 0, 1, 9, 10, 12
240 lea outputq, [outputq + r3] ; advance two lines
241 ADD_STORE_8P_2X 2, 3, 9, 10, 12
242 lea outputq, [outputq + r3]
243 ADD_STORE_8P_2X 4, 5, 9, 10, 12
244 lea outputq, [outputq + r3]
245 ADD_STORE_8P_2X 6, 7, 9, 10, 12
249 ; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero
; NOTE(review): the comment above says 10 coefficients while the function
; name says 12 -- confirm which bound the callers guarantee.
; Only the first four 16-byte input rows (offsets 0..48) are loaded. The
; dpw_* tables scale the low and high 64-bit halves of a register by
; different constants in one pmulhrsw.
; cglobal declares 3 arguments (input, output, stride), 5 general-purpose
; registers and 13 XMM registers.
250 cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
253 mova m12, [pw_11585x2]
255 lea r3, [2 * strideq] ; r3 = byte distance covering two output lines
257 mova m0, [inputq + 0]
258 mova m1, [inputq + 16]
259 mova m2, [inputq + 32]
260 mova m3, [inputq + 48]
272 punpckhqdq m10, m0, m0 ; m10 = high qword of m0 in both halves
274 punpckhqdq m9, m2, m2 ; m9 = high qword of m2 in both halves
280 pmulhrsw m2, [dpw_30274_12540]
281 pmulhrsw m1, [dpw_6392_32138]
282 pmulhrsw m3, [dpw_m18204_27246]
287 punpcklqdq m9, m3, m3 ; m9 = low qword of m3 in both halves
288 punpckhqdq m5, m3, m9 ; m5 = m3 with its two qword halves swapped
294 punpckhqdq m9, m1, m5
301 punpckhqdq m3, m0, m0
302 punpckhqdq m4, m1, m1
303 punpckhqdq m6, m5, m5
304 punpckhqdq m7, m2, m2
313 punpckhdq m10, m1, m6
316 punpckhqdq m1, m0, m5
318 punpckhqdq m3, m4, m10
319 punpcklqdq m2, m4, m10
; second set of multiplies: each source is scaled by two constants, with the
; same constant in all eight lanes of each table
323 pmulhrsw m6, m2, [dpw_30274_30274]
324 pmulhrsw m4, m2, [dpw_12540_12540]
326 pmulhrsw m7, m1, [dpw_32138_32138]
327 pmulhrsw m1, [dpw_6392_6392]
328 pmulhrsw m5, m3, [dpw_m18204_m18204]
329 pmulhrsw m3, [dpw_27246_27246]
; NOTE(review): m12 is passed in the "zero" slot below but was loaded with
; pw_11585x2 above; presumably it is cleared in a line not visible here.
352 ADD_STORE_8P_2X 0, 1, 9, 10, 12
353 lea outputq, [outputq + r3] ; advance two lines
354 ADD_STORE_8P_2X 2, 3, 9, 10, 12
355 lea outputq, [outputq + r3]
356 ADD_STORE_8P_2X 4, 5, 9, 10, 12
357 lea outputq, [outputq + r3]
358 ADD_STORE_8P_2X 6, 7, 9, 10, 12
395 ; FROM idct32x32_add_neon.asm
397 ; Instead of doing the transforms stage by stage, it is done by loading
398 ; some input values and doing as many stages as possible to minimize the
399 ; storing/loading of intermediate results. To fit within registers, the
400 ; final coefficients are cut into four blocks:
401 ; BLOCK A: 16-19,28-31
402 ; BLOCK B: 20-23,24-27
403 ; BLOCK C: 8-11,12-15
405 ; Blocks A and C are straight calculation through the various stages. In
406 ; block B, further calculations are performed using the results from
407 ; block A. In block D, further calculations are performed using the results
408 ; from block C and then the final calculations are done using results from
409 ; block A and B which have been combined at the end of block B.
412 %macro IDCT32X32_34 4
; One 1-D pass of the 32-point inverse transform on eight 16-bit lanes per
; register, for the reduced-input case used by idct32x32_34_add. In the
; visible code only transposed inputs 0..7 are consumed.
; Arguments are byte offsets added to `stp` when storing results:
;   %1 -> outputs idx0..idx7      %2 -> outputs idx8..idx15
;   %3 -> outputs idx16..idx23    %4 -> outputs idx24..idx31
; The %2/%3/%4 slots are also used as spill space between blocks.
; Register conventions visible below: m8 is passed as the rounding register
; and m9/m10 as scratch to the BUTTERFLY_4X* macros; m9 is the scratch
; register for SUM_SUB.
; NOTE(review): SUM_SUB, `stp` and the idxN offsets are defined outside this
; section. Going by the stage comments, SUM_SUB a, b, t leaves the sum in a
; and the difference in b -- confirm against its definition.
413 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The stores to r4 save m2/m4/m6 so that blocks C and D can reload them from
; [rsp + transposed_in + 16 * n]; the caller sets r4 = rsp + transposed_in.
415 pmulhrsw m1, [pw___804x2] ; stp1_16
417 pmulhrsw m11, [pw_16364x2] ; stp2_31
418 mova [r4 + 16 * 2], m2
420 pmulhrsw m7, [pw_15426x2] ; stp1_28
421 mova [r4 + 16 * 4], m4
422 pmulhrsw m12, [pw_m5520x2] ; stp2_19
423 mova [r4 + 16 * 6], m6
425 ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
426 mova m2, m1 ; stp1_16
427 mova m0, m11 ; stp1_31
428 mova m4, m7 ; stp1_28
429 mova m15, m12 ; stp1_19
431 ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
432 BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
433 BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
435 ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
436 SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
437 SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
438 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
439 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
441 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
442 BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
443 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
445 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Block A results 28..31 are spilled to the %4 slots, interleaved with the
; block B stage-1 multiplies.
447 pmulhrsw m5, [pw__3981x2] ; stp1_20
448 mova [stp + %4 + idx28], m12
449 mova [stp + %4 + idx29], m15
450 pmulhrsw m6, [pw_15893x2] ; stp2_27
451 mova [stp + %4 + idx30], m2
453 pmulhrsw m3, [pw_m2404x2] ; stp1_23
454 mova [stp + %4 + idx31], m11
455 pmulhrsw m2, [pw_16207x2] ; stp2_24
457 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
458 mova m13, m5 ; stp1_20
459 mova m14, m6 ; stp1_27
460 mova m15, m3 ; stp1_23
461 mova m11, m2 ; stp1_24
463 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
464 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
465 BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
467 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
468 SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
469 SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
470 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
471 SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
473 ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
474 BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
475 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
477 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
478 SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
479 SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
480 SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
481 SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
482 mova [stp + %3 + idx16], m1
483 mova [stp + %3 + idx17], m0
484 mova [stp + %3 + idx18], m4
485 mova [stp + %3 + idx19], m7
; reload the block A values spilled during block B stage 1
487 mova m4, [stp + %4 + idx28]
488 mova m7, [stp + %4 + idx29]
489 mova m10, [stp + %4 + idx30]
490 mova m12, [stp + %4 + idx31]
491 SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
492 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
493 SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
494 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
495 mova [stp + %4 + idx28], m4
496 mova [stp + %4 + idx29], m7
497 mova [stp + %4 + idx30], m10
498 mova [stp + %4 + idx31], m12
500 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The pmulhrsw lines under %if 0 are the disabled variant (see the overflow
; remark on the %if line).
; NOTE(review): the %else/%endif pairing with this %if 0 is not visible in
; this section; the BUTTERFLY_4X lines are presumably the live variant.
501 %if 0 ; overflow occurs in SUM_SUB when using test streams
502 mova m10, [pw_11585x2]
504 pmulhrsw m6, m10 ; stp1_27
505 pmulhrsw m5, m10 ; stp1_20
507 pmulhrsw m13, m10 ; stp1_26
508 pmulhrsw m14, m10 ; stp1_21
510 pmulhrsw m11, m10 ; stp1_25
511 pmulhrsw m15, m10 ; stp1_22
513 pmulhrsw m2, m10 ; stp1_24
514 pmulhrsw m3, m10 ; stp1_23
516 BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
518 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
520 BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
522 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
526 mova [stp + %4 + idx24], m2
527 mova [stp + %4 + idx25], m11
528 mova [stp + %4 + idx26], m13
529 mova [stp + %4 + idx27], m6
531 ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
533 ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; reload transposed inputs 2 and 6 saved at the start of block A
534 mova m0, [rsp + transposed_in + 16 * 2]
535 mova m6, [rsp + transposed_in + 16 * 6]
; block B results 20..23 are spilled to the %3 slots between the multiplies
538 pmulhrsw m0, [pw__1606x2] ; stp1_8
539 mova [stp + %3 + idx20], m5
540 mova [stp + %3 + idx21], m14
541 pmulhrsw m1, [pw_16305x2] ; stp2_15
542 mova [stp + %3 + idx22], m15
544 pmulhrsw m7, [pw_m4756x2] ; stp2_11
545 mova [stp + %3 + idx23], m3
546 pmulhrsw m6, [pw_15679x2] ; stp1_12
548 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
550 mova m2, m1 ; stp1_15
552 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
553 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
554 mova m4, m7 ; stp1_11
555 mova m5, m6 ; stp1_12
556 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
558 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
559 SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
560 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
561 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
562 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
564 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; same disabled-pmulhrsw / BUTTERFLY_4X arrangement as block B stage 7
565 %if 0 ; overflow occurs in SUM_SUB when using test streams
566 mova m10, [pw_11585x2]
568 pmulhrsw m5, m10 ; stp1_13
569 pmulhrsw m4, m10 ; stp1_10
571 pmulhrsw m6, m10 ; stp1_12
572 pmulhrsw m7, m10 ; stp1_11
574 BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
576 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
580 ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
581 mova [stp + %2 + idx8], m0
582 mova [stp + %2 + idx9], m2
583 mova [stp + %2 + idx10], m4
584 mova [stp + %2 + idx11], m7
586 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
588 ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
590 ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
591 mova m11, [rsp + transposed_in + 16 * 4]
593 pmulhrsw m11, [pw__3196x2] ; stp1_4
594 pmulhrsw m12, [pw_16069x2] ; stp1_7
596 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
597 mova m0, [rsp + transposed_in + 16 * 0]
598 mova m10, [pw_11585x2]
599 pmulhrsw m0, m10 ; stp1_1
601 mova m14, m11 ; stp1_4
602 mova m13, m12 ; stp1_7
604 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; same disabled-pmulhrsw / BUTTERFLY_4X arrangement as block B stage 7
605 %if 0 ; overflow occurs in SUM_SUB when using test streams
607 pmulhrsw m13, m10 ; stp1_6
608 pmulhrsw m14, m10 ; stp1_5
610 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
613 mova m7, m0 ; stp1_0 = stp1_1
617 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
618 SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
619 SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
620 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
621 SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
623 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
624 SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
625 SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
626 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
627 SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
629 ; 0-3, 28-31 final stage
630 mova m15, [stp + %4 + idx30]
631 mova m10, [stp + %4 + idx31]
632 SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
633 SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
634 mova [stp + %1 + idx0], m0
635 mova [stp + %1 + idx1], m7
636 mova [stp + %4 + idx30], m15
637 mova [stp + %4 + idx31], m10
638 mova m7, [stp + %4 + idx28]
639 mova m0, [stp + %4 + idx29]
640 SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
641 SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
642 mova [stp + %1 + idx2], m2
643 mova [stp + %1 + idx3], m4
644 mova [stp + %4 + idx28], m7
645 mova [stp + %4 + idx29], m0
647 ; 12-15, 16-19 final stage
648 mova m0, [stp + %3 + idx16]
649 mova m7, [stp + %3 + idx17]
650 mova m2, [stp + %3 + idx18]
651 mova m4, [stp + %3 + idx19]
652 SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
653 SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
654 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
655 SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
656 mova [stp + %2 + idx12], m6
657 mova [stp + %2 + idx13], m5
658 mova [stp + %2 + idx14], m3
659 mova [stp + %2 + idx15], m1
660 mova [stp + %3 + idx16], m0
661 mova [stp + %3 + idx17], m7
662 mova [stp + %3 + idx18], m2
663 mova [stp + %3 + idx19], m4
; reload 8..11 stored at block C stage 7 and combine them with 4..7
665 mova m4, [stp + %2 + idx8]
666 mova m5, [stp + %2 + idx9]
667 mova m6, [stp + %2 + idx10]
668 mova m7, [stp + %2 + idx11]
669 SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
670 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
671 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
672 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
674 ; 4-7, 24-27 final stage
675 mova m0, [stp + %4 + idx27]
676 mova m1, [stp + %4 + idx26]
677 mova m2, [stp + %4 + idx25]
678 mova m3, [stp + %4 + idx24]
679 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
680 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
681 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
682 SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
683 mova [stp + %4 + idx27], m0
684 mova [stp + %4 + idx26], m1
685 mova [stp + %4 + idx25], m2
686 mova [stp + %4 + idx24], m3
687 mova [stp + %1 + idx4], m11
688 mova [stp + %1 + idx5], m14
689 mova [stp + %1 + idx6], m13
690 mova [stp + %1 + idx7], m12
692 ; 8-11, 20-23 final stage
693 mova m0, [stp + %3 + idx20]
694 mova m1, [stp + %3 + idx21]
695 mova m2, [stp + %3 + idx22]
696 mova m3, [stp + %3 + idx23]
697 SUM_SUB 7, 0, 9 ; stp1_11, stp_20
698 SUM_SUB 6, 1, 9 ; stp1_10, stp_21
699 SUM_SUB 5, 2, 9 ; stp1_9, stp_22
700 SUM_SUB 4, 3, 9 ; stp1_8, stp_23
701 mova [stp + %2 + idx8], m4
702 mova [stp + %2 + idx9], m5
703 mova [stp + %2 + idx10], m6
704 mova [stp + %2 + idx11], m7
705 mova [stp + %3 + idx20], m0
706 mova [stp + %3 + idx21], m1
707 mova [stp + %3 + idx22], m2
708 mova [stp + %3 + idx23], m3
; RECON_AND_STORE offset
; Visible behaviour: loads four registers from stp at 16*32-byte intervals,
; reads 32 bytes of the current output line as four 8-byte pieces, writes 32
; bytes back as two 16-byte stores, advances outputq by one stride and loops
; while the (non-visible) flag-setting instruction leaves ZF clear.
; NOTE(review): the use of %1, the %%recon_and_store label, the loop counter
; and the add/pack arithmetic are not visible in this section.
711 %macro RECON_AND_STORE 1
717 mova m0, [stp + 16 * 32 * 0]
718 mova m1, [stp + 16 * 32 * 1]
719 mova m2, [stp + 16 * 32 * 2]
720 mova m3, [stp + 16 * 32 * 3]
; destination bytes 0..31 of the current line, 8 bytes per register
731 movh m4, [outputq + 0]
732 movh m5, [outputq + 8]
733 movh m6, [outputq + 16]
734 movh m7, [outputq + 24]
; aligned 16-byte stores: outputq is required to be 16-byte aligned here
745 mova [outputq + 0], m0
746 mova [outputq + 16], m2
747 lea outputq, [outputq + strideq] ; next output line (lea leaves flags intact)
749 jnz %%recon_and_store
; Stack scratch layout for the 32x32 functions, in bytes from rsp.
; i32x32_size is the total reservation passed to cglobal: five regions of
; 16*32 bytes. pass_one_start and pass_two_start both begin at offset 0, and
; transposed_in begins at the fifth region.
752 %define i32x32_size 16*32*5
753 %define pass_two_start 16*32*0
754 %define transposed_in 16*32*4
755 %define pass_one_start 16*32*0
; idct32x32_34_add(input, output, stride)
; cglobal declares 3 arguments, 11 general-purpose registers, 16 XMM registers
; and i32x32_size bytes of stack scratch.
; Visible flow: pass one loads rows 16*4 bytes apart, transposes them and runs
; IDCT32X32_34 with output groups 16*32 bytes apart; pass two loads rows 16
; bytes apart and runs the macro with output groups 16*8 bytes apart; finally
; RECON_AND_STORE writes to the destination.
; NOTE(review): the loads of m0, the set-up of r3, the definition of `stp`
; and the loop counters/branches are not visible in this section.
759 cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
761 lea stp, [rsp + pass_one_start]
765 lea r4, [rsp + transposed_in] ; r4 = transposed-input scratch used by the macro
767 idct32x32_34_transpose:
769 mova m1, [r3 + 16 * 4]
770 mova m2, [r3 + 16 * 8]
771 mova m3, [r3 + 16 * 12]
772 mova m4, [r3 + 16 * 16]
773 mova m5, [r3 + 16 * 20]
774 mova m6, [r3 + 16 * 24]
775 mova m7, [r3 + 16 * 28]
777 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
779 IDCT32X32_34 16*0, 16*32, 16*64, 16*96
780 lea stp, [stp + 16 * 8] ; advance the pass-one output pointer
; second pass set-up
782 lea stp, [rsp + pass_one_start]
783 lea r9, [rsp + pass_one_start]
786 lea r4, [rsp + transposed_in]
789 idct32x32_34_transpose_2:
791 mova m1, [r3 + 16 * 1]
792 mova m2, [r3 + 16 * 2]
793 mova m3, [r3 + 16 * 3]
794 mova m4, [r3 + 16 * 4]
795 mova m5, [r3 + 16 * 5]
796 mova m6, [r3 + 16 * 6]
797 mova m7, [r3 + 16 * 7]
799 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
801 IDCT32X32_34 16*0, 16*8, 16*16, 16*24
803 lea stp, [stp + 16 * 32] ; advance the pass-two output pointer
808 RECON_AND_STORE pass_two_start
812 %macro IDCT32X32_135 4
; One 1-D pass of the 32-point inverse transform on eight 16-bit lanes per
; register, for the reduced-input case used by idct32x32_135_add. In the
; visible code only transposed inputs 0..15 are loaded, from
; [rsp + transposed_in + 16 * n].
; Arguments are byte offsets added to `stp` when storing results:
;   %1 -> outputs idx0..idx7      %2 -> outputs idx8..idx15
;   %3 -> outputs idx16..idx23    %4 -> outputs idx24..idx31
; The %2/%3/%4 slots are also used as spill space between blocks.
; Register conventions visible below: m8 is passed as the rounding register
; and m9/m10 as scratch to the BUTTERFLY_4X* macros; m9 is the scratch
; register for SUM_SUB.
; NOTE(review): SUM_SUB, `stp` and the idxN offsets are defined outside this
; section. Going by the stage comments, SUM_SUB a, b, t leaves the sum in a
; and the difference in b -- confirm against its definition.
; NOTE(review): several stage-1 registers (e.g. m11, m12, m4, m2) are
; multiplied without a visible load; presumably they are copied in lines not
; shown here.
813 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
814 mova m1, [rsp + transposed_in + 16 * 1]
816 pmulhrsw m1, [pw___804x2] ; stp1_16
817 pmulhrsw m11, [pw_16364x2] ; stp2_31
819 mova m7, [rsp + transposed_in + 16 * 7]
821 pmulhrsw m7, [pw_15426x2] ; stp1_28
822 pmulhrsw m12, [pw_m5520x2] ; stp2_19
824 mova m3, [rsp + transposed_in + 16 * 9]
826 pmulhrsw m3, [pw__7005x2] ; stp1_18
827 pmulhrsw m4, [pw_14811x2] ; stp2_29
829 mova m0, [rsp + transposed_in + 16 * 15]
831 pmulhrsw m0, [pw_12140x2] ; stp1_30
832 pmulhrsw m2, [pw_m11003x2] ; stp2_17
834 ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
835 SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
836 SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
837 SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
838 SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
840 ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
841 BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
842 BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
844 ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
845 SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
846 SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
847 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
848 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
850 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
851 BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
852 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; spill all eight block A results; block B reuses every register
854 mova [stp + %3 + idx16], m1
855 mova [stp + %3 + idx17], m0
856 mova [stp + %3 + idx18], m4
857 mova [stp + %3 + idx19], m7
858 mova [stp + %4 + idx28], m12
859 mova [stp + %4 + idx29], m3
860 mova [stp + %4 + idx30], m2
861 mova [stp + %4 + idx31], m11
863 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
864 mova m2, [rsp + transposed_in + 16 * 3]
866 pmulhrsw m3, [pw_m2404x2] ; stp1_23
867 pmulhrsw m2, [pw_16207x2] ; stp2_24
869 mova m5, [rsp + transposed_in + 16 * 5]
871 pmulhrsw m5, [pw__3981x2] ; stp1_20
872 pmulhrsw m6, [pw_15893x2] ; stp2_27
874 mova m14, [rsp + transposed_in + 16 * 11]
876 pmulhrsw m13, [pw_m8423x2] ; stp1_21
877 pmulhrsw m14, [pw_14053x2] ; stp2_26
879 mova m0, [rsp + transposed_in + 16 * 13]
881 pmulhrsw m0, [pw__9760x2] ; stp1_22
882 pmulhrsw m1, [pw_13160x2] ; stp2_25
884 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
885 SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
886 SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
887 SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
888 SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
890 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
891 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
892 BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
894 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
895 SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
896 SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
897 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
898 SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
900 ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
901 BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
902 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
904 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; reload block A results 16..19 and combine them with 20..23
905 mova m4, [stp + %3 + idx16]
906 mova m7, [stp + %3 + idx17]
907 mova m11, [stp + %3 + idx18]
908 mova m12, [stp + %3 + idx19]
909 SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
910 SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
911 SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
912 SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
913 mova [stp + %3 + idx16], m4
914 mova [stp + %3 + idx17], m7
915 mova [stp + %3 + idx18], m11
916 mova [stp + %3 + idx19], m12
; reload block A results 28..31 and combine them with 24..27
918 mova m4, [stp + %4 + idx28]
919 mova m7, [stp + %4 + idx29]
920 mova m11, [stp + %4 + idx30]
921 mova m12, [stp + %4 + idx31]
922 SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
923 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
924 SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
925 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
926 mova [stp + %4 + idx28], m4
927 mova [stp + %4 + idx29], m7
928 mova [stp + %4 + idx30], m11
929 mova [stp + %4 + idx31], m12
931 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The pmulhrsw lines under %if 0 are the disabled variant (see the overflow
; remark on the %if line).
; NOTE(review): the %else/%endif pairing with this %if 0 is not visible in
; this section; the BUTTERFLY_4X lines are presumably the live variant.
932 %if 0 ; overflow occurs in SUM_SUB when using test streams
933 mova m10, [pw_11585x2]
935 pmulhrsw m6, m10 ; stp1_27
936 pmulhrsw m5, m10 ; stp1_20
938 pmulhrsw m13, m10 ; stp1_26
939 pmulhrsw m14, m10 ; stp1_21
941 pmulhrsw m1, m10 ; stp1_25
942 pmulhrsw m0, m10 ; stp1_22
944 pmulhrsw m2, m10 ; stp1_24
945 pmulhrsw m3, m10 ; stp1_23
947 BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
949 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
951 BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
953 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
956 mova [stp + %3 + idx20], m5
957 mova [stp + %3 + idx21], m14
958 mova [stp + %3 + idx22], m0
959 mova [stp + %3 + idx23], m3
960 mova [stp + %4 + idx24], m2
961 mova [stp + %4 + idx25], m1
962 mova [stp + %4 + idx26], m13
963 mova [stp + %4 + idx27], m6
965 ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
967 ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
968 mova m0, [rsp + transposed_in + 16 * 2]
970 pmulhrsw m0, [pw__1606x2] ; stp1_8
971 pmulhrsw m1, [pw_16305x2] ; stp2_15
973 mova m6, [rsp + transposed_in + 16 * 6]
975 pmulhrsw m7, [pw_m4756x2] ; stp2_11
976 pmulhrsw m6, [pw_15679x2] ; stp1_12
978 mova m4, [rsp + transposed_in + 16 * 10]
980 pmulhrsw m4, [pw__7723x2] ; stp1_10
981 pmulhrsw m5, [pw_14449x2] ; stp2_13
983 mova m2, [rsp + transposed_in + 16 * 14]
985 pmulhrsw m3, [pw_m10394x2] ; stp1_9
986 pmulhrsw m2, [pw_12665x2] ; stp2_14
988 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
989 SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
990 SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
991 SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
992 SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
994 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
995 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
996 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
998 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
999 SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
1000 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
1001 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
1002 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
1004 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; same disabled-pmulhrsw / BUTTERFLY_4X arrangement as block B stage 7
1005 %if 0 ; overflow occurs in SUM_SUB when using test streams
1006 mova m10, [pw_11585x2]
1008 pmulhrsw m5, m10 ; stp1_13
1009 pmulhrsw m4, m10 ; stp1_10
1011 pmulhrsw m6, m10 ; stp1_12
1012 pmulhrsw m7, m10 ; stp1_11
1014 BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
1016 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
1019 ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1020 mova [stp + %2 + idx8], m0
1021 mova [stp + %2 + idx9], m2
1022 mova [stp + %2 + idx10], m4
1023 mova [stp + %2 + idx11], m7
1024 mova [stp + %2 + idx12], m6
1025 mova [stp + %2 + idx13], m5
1026 mova [stp + %2 + idx14], m3
1027 mova [stp + %2 + idx15], m1
1029 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1031 ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1033 ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1034 mova m11, [rsp + transposed_in + 16 * 4]
1036 pmulhrsw m11, [pw__3196x2] ; stp1_4
1037 pmulhrsw m12, [pw_16069x2] ; stp1_7
1039 mova m13, [rsp + transposed_in + 16 * 12]
1041 pmulhrsw m13, [pw_13623x2] ; stp1_6
1042 pmulhrsw m14, [pw_m9102x2] ; stp1_5
1044 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1045 mova m0, [rsp + transposed_in + 16 * 0]
1046 mova m2, [rsp + transposed_in + 16 * 8]
1047 pmulhrsw m0, [pw_11585x2] ; stp1_1
1049 pmulhrsw m2, [pw__6270x2] ; stp1_2
1050 pmulhrsw m3, [pw_15137x2] ; stp1_3
1052 SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
1053 SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
1055 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; same disabled-pmulhrsw / BUTTERFLY_4X arrangement as block B stage 7
1056 %if 0 ; overflow occurs in SUM_SUB when using test streams
1057 mova m10, [pw_11585x2]
1059 pmulhrsw m13, m10 ; stp1_6
1060 pmulhrsw m14, m10 ; stp1_5
1062 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
1065 mova m1, m0 ; stp1_0 = stp1_1
1066 SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
1067 SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
1069 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1070 SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
1071 SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
1072 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
1073 SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
1075 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; reload 12..15 stored at block C stage 7
1076 mova m4, [stp + %2 + idx12]
1077 mova m5, [stp + %2 + idx13]
1078 mova m6, [stp + %2 + idx14]
1079 mova m7, [stp + %2 + idx15]
1080 SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
1081 SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
1082 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
1083 SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
1085 ; 0-3, 28-31 final stage
1086 mova m10, [stp + %4 + idx31]
1087 mova m15, [stp + %4 + idx30]
1088 SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
1089 SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
1090 mova [stp + %1 + idx0], m0
1091 mova [stp + %1 + idx1], m1
1092 mova [stp + %4 + idx31], m10
1093 mova [stp + %4 + idx30], m15
1094 mova m0, [stp + %4 + idx29]
1095 mova m1, [stp + %4 + idx28]
1096 SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
1097 SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
1098 mova [stp + %1 + idx2], m2
1099 mova [stp + %1 + idx3], m3
1100 mova [stp + %4 + idx29], m0
1101 mova [stp + %4 + idx28], m1
1103 ; 12-15, 16-19 final stage
1104 mova m0, [stp + %3 + idx16]
1105 mova m1, [stp + %3 + idx17]
1106 mova m2, [stp + %3 + idx18]
1107 mova m3, [stp + %3 + idx19]
1108 SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
1109 SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
1110 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
1111 SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
1112 mova [stp + %2 + idx12], m4
1113 mova [stp + %2 + idx13], m5
1114 mova [stp + %2 + idx14], m6
1115 mova [stp + %2 + idx15], m7
1116 mova [stp + %3 + idx16], m0
1117 mova [stp + %3 + idx17], m1
1118 mova [stp + %3 + idx18], m2
1119 mova [stp + %3 + idx19], m3
; reload 8..11 stored at block C stage 7 and combine them with 4..7
1121 mova m4, [stp + %2 + idx8]
1122 mova m5, [stp + %2 + idx9]
1123 mova m6, [stp + %2 + idx10]
1124 mova m7, [stp + %2 + idx11]
1125 SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
1126 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
1127 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
1128 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
1130 ; 4-7, 24-27 final stage
1131 mova m3, [stp + %4 + idx24]
1132 mova m2, [stp + %4 + idx25]
1133 mova m1, [stp + %4 + idx26]
1134 mova m0, [stp + %4 + idx27]
1135 SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
1136 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
1137 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
1138 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
1139 mova [stp + %4 + idx24], m3
1140 mova [stp + %4 + idx25], m2
1141 mova [stp + %4 + idx26], m1
1142 mova [stp + %4 + idx27], m0
1143 mova [stp + %1 + idx4], m11
1144 mova [stp + %1 + idx5], m14
1145 mova [stp + %1 + idx6], m13
1146 mova [stp + %1 + idx7], m12
1148 ; 8-11, 20-23 final stage
1149 mova m0, [stp + %3 + idx20]
1150 mova m1, [stp + %3 + idx21]
1151 mova m2, [stp + %3 + idx22]
1152 mova m3, [stp + %3 + idx23]
1153 SUM_SUB 7, 0, 9 ; stp1_11, stp_20
1154 SUM_SUB 6, 1, 9 ; stp1_10, stp_21
1155 SUM_SUB 5, 2, 9 ; stp1_9, stp_22
1156 SUM_SUB 4, 3, 9 ; stp1_8, stp_23
1157 mova [stp + %2 + idx8], m4
1158 mova [stp + %2 + idx9], m5
1159 mova [stp + %2 + idx10], m6
1160 mova [stp + %2 + idx11], m7
1161 mova [stp + %3 + idx20], m0
1162 mova [stp + %3 + idx21], m1
1163 mova [stp + %3 + idx22], m2
1164 mova [stp + %3 + idx23], m3
; idct32x32_135_add(input, output, stride)
; cglobal declares 3 arguments, 11 general-purpose registers, 16 XMM registers
; and i32x32_size bytes of stack scratch.
; Visible flow: each pass transposes 8x8 tiles into the transposed_in scratch
; area (looping on the *_transpose labels), runs IDCT32X32_135 on it and
; advances stp; RECON_AND_STORE then writes to the destination.
; NOTE(review): the loads/stores of m0, the set-up and stepping of r3/r4, the
; definition of `stp` and the loop counters and compares feeding the jne
; branches are not visible in this section.
1168 cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
1171 lea stp, [rsp + pass_one_start]
1175 lea r4, [rsp + transposed_in] ; r4 = destination for transposed tiles
1178 idct32x32_135_transpose:
; pass one: source rows are 16*4 bytes apart
1180 mova m1, [r3 + 16 * 4]
1181 mova m2, [r3 + 16 * 8]
1182 mova m3, [r3 + 16 * 12]
1183 mova m4, [r3 + 16 * 16]
1184 mova m5, [r3 + 16 * 20]
1185 mova m6, [r3 + 16 * 24]
1186 mova m7, [r3 + 16 * 28]
1188 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
1191 mova [r4 + 16 * 1], m1
1192 mova [r4 + 16 * 2], m2
1193 mova [r4 + 16 * 3], m3
1194 mova [r4 + 16 * 4], m4
1195 mova [r4 + 16 * 5], m5
1196 mova [r4 + 16 * 6], m6
1197 mova [r4 + 16 * 7], m7
1202 jne idct32x32_135_transpose
1204 IDCT32X32_135 16*0, 16*32, 16*64, 16*96
1205 lea stp, [stp + 16 * 8] ; advance the pass-one output pointer
1206 lea inputq, [inputq + 16 * 32] ; advance the input pointer by 16*32 bytes
; second pass set-up
1211 lea stp, [rsp + pass_one_start]
1212 lea r9, [rsp + pass_one_start]
1215 lea r4, [rsp + transposed_in]
1219 idct32x32_135_transpose_2:
; pass two: source rows are 16 bytes apart
1221 mova m1, [r3 + 16 * 1]
1222 mova m2, [r3 + 16 * 2]
1223 mova m3, [r3 + 16 * 3]
1224 mova m4, [r3 + 16 * 4]
1225 mova m5, [r3 + 16 * 5]
1226 mova m6, [r3 + 16 * 6]
1227 mova m7, [r3 + 16 * 7]
1229 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
1232 mova [r4 + 16 * 1], m1
1233 mova [r4 + 16 * 2], m2
1234 mova [r4 + 16 * 3], m3
1235 mova [r4 + 16 * 4], m4
1236 mova [r4 + 16 * 5], m5
1237 mova [r4 + 16 * 6], m6
1238 mova [r4 + 16 * 7], m7
1243 jne idct32x32_135_transpose_2
1245 IDCT32X32_135 16*0, 16*8, 16*16, 16*24
1247 lea stp, [stp + 16 * 32] ; advance the pass-two output pointer
1252 RECON_AND_STORE pass_two_start
; IDCT32X32_1024 %1, %2, %3, %4
; Butterfly / sum-difference network of the 32-point inverse transform (per
; the macro name) applied to the 32 vectors stored at
; [rsp + transposed_in + 16 * k], k = 0..31.
;   %1 - offset added to stp for outputs idx0  .. idx7
;   %2 - offset added to stp for outputs idx8  .. idx15
;   %3 - offset added to stp for outputs idx16 .. idx23
;   %4 - offset added to stp for outputs idx24 .. idx31
; The work is split into four blocks:
;   A: inputs 1, 31, 15, 17, 7, 25, 9, 23  -> idx16-19 and idx28-31
;   B: inputs 5, 27, 21, 11, 13, 19, 3, 29 -> idx20-23 and idx24-27
;   C: inputs 2, 30, 14, 18, 10, 22, 6, 26 -> idx8-15
;   D: inputs 4, 28, 12, 20, 0, 16, 8, 24  -> idx0-7, then the final stages
;      combine D with the values stored by A, B and C.
; Registers: m0-m7 and m11-m15 carry data; m8 is forwarded to every
; BUTTERFLY_4X (presumably the pd_8192 rounding constant - it is not loaded
; inside this macro); 9 and 10 are passed as trailing arguments to
; BUTTERFLY_4X / SUM_SUB (presumably scratch register indices).
; SUM_SUB, BUTTERFLY_4X and BUTTERFLY_4Xmm are defined elsewhere; the
; trailing comments name the stage values produced by each line.
; The "%if 0" regions keep a pmulhrsw-based variant that, per its inline
; comment, overflows on test streams.
; NOTE(review): the matching %else / %endif directives and the %endmacro are
; not visible in this listing - confirm them against the full file.
1256 %macro IDCT32X32_1024 4
1257 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1258 mova m1, [rsp + transposed_in + 16 * 1]
1259 mova m11, [rsp + transposed_in + 16 * 31]
1260 BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
1262 mova m0, [rsp + transposed_in + 16 * 15]
1263 mova m2, [rsp + transposed_in + 16 * 17]
1264 BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
1266 mova m7, [rsp + transposed_in + 16 * 7]
1267 mova m12, [rsp + transposed_in + 16 * 25]
1268 BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
1270 mova m3, [rsp + transposed_in + 16 * 9]
1271 mova m4, [rsp + transposed_in + 16 * 23]
1272 BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
1274 ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1275 SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
1276 SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
1277 SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
1278 SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
1280 ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1281 BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
1282 BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
1284 ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1285 SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
1286 SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
1287 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
1288 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
1290 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1291 BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
1292 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
; Park block A results in the output area so their registers can be reused
; by block B; they are reloaded in block B stage 6.
1294 mova [stp + %3 + idx16], m1
1295 mova [stp + %3 + idx17], m0
1296 mova [stp + %3 + idx18], m4
1297 mova [stp + %3 + idx19], m7
1298 mova [stp + %4 + idx28], m12
1299 mova [stp + %4 + idx29], m3
1300 mova [stp + %4 + idx30], m2
1301 mova [stp + %4 + idx31], m11
1303 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1304 mova m5, [rsp + transposed_in + 16 * 5]
1305 mova m6, [rsp + transposed_in + 16 * 27]
1306 BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
1308 mova m13, [rsp + transposed_in + 16 * 21]
1309 mova m14, [rsp + transposed_in + 16 * 11]
1310 BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
1312 mova m0, [rsp + transposed_in + 16 * 13]
1313 mova m1, [rsp + transposed_in + 16 * 19]
1314 BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
1316 mova m2, [rsp + transposed_in + 16 * 3]
1317 mova m3, [rsp + transposed_in + 16 * 29]
1318 BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
1320 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1321 SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
1322 SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
1323 SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
1324 SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
1326 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1327 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
1328 BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
1330 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1331 SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
1332 SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
1333 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
1334 SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
1336 ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1337 BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
1338 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
1340 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Reload the block A values for 16-19, combine them with block B, and write
; the updated 16-19 values back to the same slots.
1341 mova m4, [stp + %3 + idx16]
1342 mova m7, [stp + %3 + idx17]
1343 mova m11, [stp + %3 + idx18]
1344 mova m12, [stp + %3 + idx19]
1345 SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
1346 SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
1347 SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
1348 SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
1349 mova [stp + %3 + idx16], m4
1350 mova [stp + %3 + idx17], m7
1351 mova [stp + %3 + idx18], m11
1352 mova [stp + %3 + idx19], m12
; Same treatment for the block A values 28-31.
1354 mova m4, [stp + %4 + idx28]
1355 mova m7, [stp + %4 + idx29]
1356 mova m11, [stp + %4 + idx30]
1357 mova m12, [stp + %4 + idx31]
1358 SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
1359 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
1360 SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
1361 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
1362 mova [stp + %4 + idx28], m4
1363 mova [stp + %4 + idx29], m7
1364 mova [stp + %4 + idx30], m11
1365 mova [stp + %4 + idx31], m12
1367 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1368 %if 0 ; overflow occurs in SUM_SUB when using test streams
1369 mova m10, [pw_11585x2]
1371 pmulhrsw m6, m10 ; stp1_27
1372 pmulhrsw m5, m10 ; stp1_20
1374 pmulhrsw m13, m10 ; stp1_26
1375 pmulhrsw m14, m10 ; stp1_21
1377 pmulhrsw m1, m10 ; stp1_25
1378 pmulhrsw m0, m10 ; stp1_22
1380 pmulhrsw m2, m10 ; stp1_24
1381 pmulhrsw m3, m10 ; stp1_23
1383 BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
1385 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
1387 BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
1389 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
; Store the finished block B values 20-27.
1392 mova [stp + %3 + idx20], m5
1393 mova [stp + %3 + idx21], m14
1394 mova [stp + %3 + idx22], m0
1395 mova [stp + %3 + idx23], m3
1396 mova [stp + %4 + idx24], m2
1397 mova [stp + %4 + idx25], m1
1398 mova [stp + %4 + idx26], m13
1399 mova [stp + %4 + idx27], m6
1401 ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1403 ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1404 mova m0, [rsp + transposed_in + 16 * 2]
1405 mova m1, [rsp + transposed_in + 16 * 30]
1406 BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
1408 mova m2, [rsp + transposed_in + 16 * 14]
1409 mova m3, [rsp + transposed_in + 16 * 18]
1410 BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
1412 mova m4, [rsp + transposed_in + 16 * 10]
1413 mova m5, [rsp + transposed_in + 16 * 22]
1414 BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
1416 mova m6, [rsp + transposed_in + 16 * 6]
1417 mova m7, [rsp + transposed_in + 16 * 26]
1418 BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
1420 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1421 SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
1422 SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
1423 SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
1424 SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
1426 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1427 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
1428 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
1430 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1431 SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
1432 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
1433 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
1434 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
1436 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1437 %if 0 ; overflow occurs in SUM_SUB when using test streams
1438 mova m10, [pw_11585x2]
1440 pmulhrsw m5, m10 ; stp1_13
1441 pmulhrsw m4, m10 ; stp1_10
1443 pmulhrsw m6, m10 ; stp1_12
1444 pmulhrsw m7, m10 ; stp1_11
1446 BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
1448 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
1451 ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Park the block C values 8-15; 12-15 are reloaded in block D stage 7 and
; 8-11 in the later final stage.
1452 mova [stp + %2 + idx8], m0
1453 mova [stp + %2 + idx9], m2
1454 mova [stp + %2 + idx10], m4
1455 mova [stp + %2 + idx11], m7
1456 mova [stp + %2 + idx12], m6
1457 mova [stp + %2 + idx13], m5
1458 mova [stp + %2 + idx14], m3
1459 mova [stp + %2 + idx15], m1
1461 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1463 ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1465 ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1466 mova m11, [rsp + transposed_in + 16 * 4]
1467 mova m12, [rsp + transposed_in + 16 * 28]
1468 BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
1470 mova m13, [rsp + transposed_in + 16 * 12]
1471 mova m14, [rsp + transposed_in + 16 * 20]
1472 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
1474 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1475 mova m0, [rsp + transposed_in + 16 * 0]
1476 mova m1, [rsp + transposed_in + 16 * 16]
1478 %if 0 ; overflow occurs in SUM_SUB when using test streams
1479 mova m10, [pw_11585x2]
1481 pmulhrsw m0, m10 ; stp1_1
1482 pmulhrsw m1, m10 ; stp1_0
1484 BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
1487 mova m2, [rsp + transposed_in + 16 * 8]
1488 mova m3, [rsp + transposed_in + 16 * 24]
1489 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
; NOTE(review): the only visible direct readers of m10 before it is reloaded
; in the "0-3, 28-31 final stage" are the pmulhrsw lines under "%if 0"
; (BUTTERFLY_4X also receives 10 as its last argument) - confirm that this
; load is still needed.
1491 mova m10, [pw_11585x2]
1492 SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
1493 SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
1495 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1496 %if 0 ; overflow occurs in SUM_SUB when using test streams
1498 pmulhrsw m13, m10 ; stp1_6
1499 pmulhrsw m14, m10 ; stp1_5
1501 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
1504 SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
1505 SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
1507 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1508 SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
1509 SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
1510 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
1511 SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
1513 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Reload the block C values 12-15 and pair them with 0-3.
1514 mova m4, [stp + %2 + idx12]
1515 mova m5, [stp + %2 + idx13]
1516 mova m6, [stp + %2 + idx14]
1517 mova m7, [stp + %2 + idx15]
1518 SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
1519 SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
1520 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
1521 SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
1523 ; 0-3, 28-31 final stage
; m10 and m15 are used as data registers here (values 31 and 30).
1524 mova m10, [stp + %4 + idx31]
1525 mova m15, [stp + %4 + idx30]
1526 SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
1527 SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
1528 mova [stp + %1 + idx0], m0
1529 mova [stp + %1 + idx1], m1
1530 mova [stp + %4 + idx31], m10
1531 mova [stp + %4 + idx30], m15
1532 mova m0, [stp + %4 + idx29]
1533 mova m1, [stp + %4 + idx28]
1534 SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
1535 SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
1536 mova [stp + %1 + idx2], m2
1537 mova [stp + %1 + idx3], m3
1538 mova [stp + %4 + idx29], m0
1539 mova [stp + %4 + idx28], m1
1541 ; 12-15, 16-19 final stage
1542 mova m0, [stp + %3 + idx16]
1543 mova m1, [stp + %3 + idx17]
1544 mova m2, [stp + %3 + idx18]
1545 mova m3, [stp + %3 + idx19]
1546 SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
1547 SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
1548 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
1549 SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
1550 mova [stp + %2 + idx12], m4
1551 mova [stp + %2 + idx13], m5
1552 mova [stp + %2 + idx14], m6
1553 mova [stp + %2 + idx15], m7
1554 mova [stp + %3 + idx16], m0
1555 mova [stp + %3 + idx17], m1
1556 mova [stp + %3 + idx18], m2
1557 mova [stp + %3 + idx19], m3
; Reload the block C values 8-11 and pair them with 4-7 (still held in
; m11, m14, m13, m12 from block D stage 6).
1559 mova m4, [stp + %2 + idx8]
1560 mova m5, [stp + %2 + idx9]
1561 mova m6, [stp + %2 + idx10]
1562 mova m7, [stp + %2 + idx11]
1563 SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
1564 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
1565 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
1566 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
1568 ; 4-7, 24-27 final stage
1569 mova m3, [stp + %4 + idx24]
1570 mova m2, [stp + %4 + idx25]
1571 mova m1, [stp + %4 + idx26]
1572 mova m0, [stp + %4 + idx27]
1573 SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
1574 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
1575 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
1576 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
1577 mova [stp + %4 + idx24], m3
1578 mova [stp + %4 + idx25], m2
1579 mova [stp + %4 + idx26], m1
1580 mova [stp + %4 + idx27], m0
1581 mova [stp + %1 + idx4], m11
1582 mova [stp + %1 + idx5], m14
1583 mova [stp + %1 + idx6], m13
1584 mova [stp + %1 + idx7], m12
1586 ; 8-11, 20-23 final stage
1587 mova m0, [stp + %3 + idx20]
1588 mova m1, [stp + %3 + idx21]
1589 mova m2, [stp + %3 + idx22]
1590 mova m3, [stp + %3 + idx23]
1591 SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
1592 SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
1593 SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
1594 SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
1595 mova [stp + %2 + idx8], m4
1596 mova [stp + %2 + idx9], m5
1597 mova [stp + %2 + idx10], m6
1598 mova [stp + %2 + idx11], m7
1599 mova [stp + %3 + idx20], m0
1600 mova [stp + %3 + idx21], m1
1601 mova [stp + %3 + idx22], m2
1602 mova [stp + %3 + idx23], m3
; idct32x32_1024_add(input, output, stride)
; Declared through x86inc's cglobal: 3 arguments, 11 general-purpose
; registers, 16 XMM registers; i32x32_size is presumably the stack
; reservation (its definition is outside this listing).
; Visible flow: transpose groups of eight vectors into
; [rsp + transposed_in], invoke IDCT32X32_1024 for pass one, repeat the
; transpose + IDCT32X32_1024 for pass two, then invoke RECON_AND_STORE with
; pass_two_start.
; NOTE(review): this listing shows no load/store of m0, no setup of r3, no
; loop-counter updates, no definition of the idct32x32_1024_2 label and no
; RET - confirm them against the full file.
1606 cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
1609 lea stp, [rsp + pass_one_start] ; stp -> pass-one area addressed off rsp
1613 lea r4, [rsp + transposed_in] ; r4 -> buffer that receives the transposed vectors
; Pass one gather: the vectors are read 16 * 4 bytes apart from r3
; (presumably one 32-coefficient row of 16-bit values per step - confirm).
1616 idct32x32_1024_transpose:
1618 mova m1, [r3 + 16 * 4]
1619 mova m2, [r3 + 16 * 8]
1620 mova m3, [r3 + 16 * 12]
1621 mova m4, [r3 + 16 * 16]
1622 mova m5, [r3 + 16 * 20]
1623 mova m6, [r3 + 16 * 24]
1624 mova m7, [r3 + 16 * 28]
1626 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 ; operates on m0-m7; 9 is presumably a scratch register index
; Store the transposed vectors back to back (16 bytes each) at r4.
1629 mova [r4 + 16 * 1], m1
1630 mova [r4 + 16 * 2], m2
1631 mova [r4 + 16 * 3], m3
1632 mova [r4 + 16 * 4], m4
1633 mova [r4 + 16 * 5], m5
1634 mova [r4 + 16 * 6], m6
1635 mova [r4 + 16 * 7], m7
1640 jne idct32x32_1024_transpose ; loop while ZF is clear; the flag-setting instruction is not shown here
1642 IDCT32X32_1024 16*0, 16*32, 16*64, 16*96 ; pass one: output groups 0-7 / 8-15 / 16-23 / 24-31 at offsets 16 * 32 bytes apart
1644 lea stp, [stp + 16 * 8] ; advance the pass-one output pointer by 128 bytes
1645 lea inputq, [inputq + 16 * 32] ; advance the input pointer by 512 bytes
; Pass two setup: both stp and r9 restart at the beginning of the pass-one area.
1650 lea stp, [rsp + pass_one_start]
1651 lea r9, [rsp + pass_one_start]
1654 lea r4, [rsp + transposed_in] ; reuse the same transpose buffer
; Pass two gather: the vectors are now read back to back (16 bytes apart) from r3.
1658 idct32x32_1024_transpose_2:
1660 mova m1, [r3 + 16 * 1]
1661 mova m2, [r3 + 16 * 2]
1662 mova m3, [r3 + 16 * 3]
1663 mova m4, [r3 + 16 * 4]
1664 mova m5, [r3 + 16 * 5]
1665 mova m6, [r3 + 16 * 6]
1666 mova m7, [r3 + 16 * 7]
1668 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 ; same transpose as in pass one
1671 mova [r4 + 16 * 1], m1
1672 mova [r4 + 16 * 2], m2
1673 mova [r4 + 16 * 3], m3
1674 mova [r4 + 16 * 4], m4
1675 mova [r4 + 16 * 5], m5
1676 mova [r4 + 16 * 6], m6
1677 mova [r4 + 16 * 7], m7
1682 jne idct32x32_1024_transpose_2 ; loop while ZF is clear; the flag-setting instruction is not shown here
1684 IDCT32X32_1024 16*0, 16*8, 16*16, 16*24 ; pass two: output groups at offsets 16 * 8 bytes apart
1686 lea stp, [stp + 16 * 32] ; advance the output pointer by 512 bytes
1689 jnz idct32x32_1024_2 ; outer pass-two loop; target label and counter update are not shown here
1691 RECON_AND_STORE pass_two_start ; macro defined elsewhere; presumably adds the result to the output block - confirm