From: Scott LaVarnway
Date: Wed, 25 Nov 2015 18:11:29 +0000 (-0800)
Subject: VPX: x86 asm version of vpx_idct32x32_1024_add()
X-Git-Tag: v1.6.0~523^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0148e20c3ce875fe28d0aee92d9f041df809b1f2;p=libvpx

VPX: x86 asm version of vpx_idct32x32_1024_add()

Change-Id: I3ba4ede553e068bf116dce59d1317347988b3542
---

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 4de85a431..e972cd850 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -890,7 +890,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc";
 
     add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc";
diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
index 2675eab1f..43c7ad87e 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -57,6 +57,14 @@ TRANSFORM_COEFFS 1606, 16305
 TRANSFORM_COEFFS 15679, 4756
 TRANSFORM_COEFFS 11585, 11585
 
+; constants for 32x32_1024
+TRANSFORM_COEFFS 12140, 11003
+TRANSFORM_COEFFS 7005, 14811
+TRANSFORM_COEFFS 14053, 8423
+TRANSFORM_COEFFS 9760, 13160
+TRANSFORM_COEFFS 12665, 10394
+TRANSFORM_COEFFS 7723, 14449
+
 %macro PAIR_PP_COEFFS 2
 dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
 %endmacro
@@ -368,23 +376,24 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
 %define idx30 16 * 6
 %define idx31 16 * 7
 
+; FROM idct32x32_add_neon.asm
+;
+; Instead of doing the transforms stage by stage, it is done by loading
+; some input values and doing as many stages as possible to minimize the
+; storing/loading of intermediate results. To fit within registers, the
+; final coefficients are cut into four blocks:
+; BLOCK A: 16-19,28-31
+; BLOCK B: 20-23,24-27
+; BLOCK C: 8-11,12-15
+; BLOCK D: 0-3,4-7
+; Blocks A and C are straight calculation through the various stages. In
+; block B, further calculations are performed using the results from
+; block A. In block D, further calculations are performed using the results
+; from block C and then the final calculations are done using results from
+; block A and B which have been combined at the end of block B.
+;
+
 %macro IDCT32X32_34 4
-  ; FROM idct32x32_add_neon.asm
-  ;
-  ; Instead of doing the transforms stage by stage, it is done by loading
-  ; some input values and doing as many stages as possible to minimize the
-  ; storing/loading of intermediate results. To fit within registers, the
-  ; final coefficients are cut into four blocks:
-  ; BLOCK A: 16-19,28-31
-  ; BLOCK B: 20-23,24-27
-  ; BLOCK C: 8-11,12-15
-  ; BLOCK D: 0-3,4-7
-  ; Blocks A and C are straight calculation through the various stages. In
-  ; block B, further calculations are performed using the results from
-  ; block A. In block D, further calculations are performed using the results
-  ; from block C and then the final calculations are done using results from
-  ; block A and B which have been combined at the end of block B.
-  ;
   ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   mova            m11, m1
   pmulhrsw        m1, [pw___804x2]  ; stp1_16
@@ -475,7 +484,7 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
   ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova            m10, [pw_11585x2]
-  SUM_SUB        6, 5, 9
+  SUM_SUB         6, 5, 9
   pmulhrsw        m6, m10  ; stp1_27
   pmulhrsw        m5, m10  ; stp1_20
   SUM_SUB         13, 14, 9
@@ -539,10 +548,10 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
   ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 %if 0 ; overflow occurs in SUM_SUB when using test streams
   mova            m10, [pw_11585x2]
-  SUM_SUB        5, 4, 9
+  SUM_SUB         5, 4, 9
   pmulhrsw        m5, m10  ; stp1_13
   pmulhrsw        m4, m10  ; stp1_10
-  SUM_SUB        6, 7, 9
+  SUM_SUB         6, 7, 9
   pmulhrsw        m6, m10  ; stp1_12
   pmulhrsw        m7, m10  ; stp1_11
 %else
@@ -782,5 +791,444 @@ idct32x32_34_transpose_2:
 
   RECON_AND_STORE pass_two_start
 
+  RET
+
+%macro IDCT32X32_1024 4 ; %1..%4: stp byte offsets for rows 0-7, 8-15, 16-23, 24-31
+  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m1, [rsp + transposed_in + 16 * 1]
+  mova            m11, [rsp + transposed_in + 16 * 31]
+  BUTTERFLY_4X    1, 11, 804, 16364, m8, 9, 10  ; stp1_16, stp1_31
+
+  mova            m0, [rsp + transposed_in + 16 * 15]
+  mova            m2, [rsp + transposed_in + 16 * 17]
+  BUTTERFLY_4X    2, 0, 12140, 11003, m8, 9, 10  ; stp1_17, stp1_30
+
+  mova            m7, [rsp + transposed_in + 16 * 7]
+  mova            m12, [rsp + transposed_in + 16 * 25]
+  BUTTERFLY_4X    12, 7, 15426, 5520, m8, 9, 10  ; stp1_19, stp1_28
+
+  mova            m3, [rsp + transposed_in + 16 * 9]
+  mova            m4, [rsp + transposed_in + 16 * 23]
+  BUTTERFLY_4X    3, 4, 7005, 14811, m8, 9, 10  ; stp1_18, stp1_29
+
+  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         1, 2, 9  ; stp2_16, stp2_17
+  SUM_SUB         12, 3, 9  ; stp2_19, stp2_18
+  SUM_SUB         7, 4, 9  ; stp2_28, stp2_29
+  SUM_SUB         11, 0, 9  ; stp2_31, stp2_30
+
+  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X    0, 2, 3196, 16069, m8, 9, 10  ; stp1_17, stp1_30
+  BUTTERFLY_4Xmm  4, 3, 3196, 16069, m8, 9, 10  ; stp1_29, stp1_18
+
+  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         1, 12, 9  ; stp2_16, stp2_19
+  SUM_SUB         0, 3, 9  ; stp2_17, stp2_18
+  SUM_SUB         11, 7, 9  ; stp2_31, stp2_28
+  SUM_SUB         2, 4, 9  ; stp2_30, stp2_29
+
+  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X    4, 3, 6270, 15137, m8, 9, 10  ; stp1_18, stp1_29
+  BUTTERFLY_4X    7, 12, 6270, 15137, m8, 9, 10  ; stp1_19, stp1_28
+
+  mova            [stp + %3 + idx16], m1  ; park block A; reloaded in BLOCK B STAGE 6
+  mova            [stp + %3 + idx17], m0
+  mova            [stp + %3 + idx18], m4
+  mova            [stp + %3 + idx19], m7
+  mova            [stp + %4 + idx28], m12
+  mova            [stp + %4 + idx29], m3
+  mova            [stp + %4 + idx30], m2
+  mova            [stp + %4 + idx31], m11
+
+  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m5, [rsp + transposed_in + 16 * 5]
+  mova            m6, [rsp + transposed_in + 16 * 27]
+  BUTTERFLY_4X    5, 6, 3981, 15893, m8, 9, 10  ; stp1_20, stp1_27
+
+  mova            m13, [rsp + transposed_in + 16 * 21]
+  mova            m14, [rsp + transposed_in + 16 * 11]
+  BUTTERFLY_4X    13, 14, 14053, 8423, m8, 9, 10  ; stp1_21, stp1_26
+
+  mova            m0, [rsp + transposed_in + 16 * 13]
+  mova            m1, [rsp + transposed_in + 16 * 19]
+  BUTTERFLY_4X    0, 1, 9760, 13160, m8, 9, 10  ; stp1_22, stp1_25
+
+  mova            m2, [rsp + transposed_in + 16 * 3]
+  mova            m3, [rsp + transposed_in + 16 * 29]
+  BUTTERFLY_4X    3, 2, 16207, 2404, m8, 9, 10  ; stp1_23, stp1_24
+
+  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         5, 13, 9  ; stp2_20, stp2_21
+  SUM_SUB         3, 0, 9  ; stp2_23, stp2_22
+  SUM_SUB         2, 1, 9  ; stp2_24, stp2_25
+  SUM_SUB         6, 14, 9  ; stp2_27, stp2_26
+
+  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X    14, 13, 13623, 9102, m8, 9, 10  ; stp1_21, stp1_26
+  BUTTERFLY_4Xmm  1, 0, 13623, 9102, m8, 9, 10  ; stp1_25, stp1_22
+
+  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         3, 5, 9  ; stp2_23, stp2_20
+  SUM_SUB         0, 14, 9  ; stp2_22, stp2_21
+  SUM_SUB         2, 6, 9  ; stp2_24, stp2_27
+  SUM_SUB         1, 13, 9  ; stp2_25, stp2_26
+
+  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4Xmm  6, 5, 6270, 15137, m8, 9, 10  ; stp1_27, stp1_20
+  BUTTERFLY_4Xmm  13, 14, 6270, 15137, m8, 9, 10  ; stp1_26, stp1_21
+
+  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m4, [stp + %3 + idx16]  ; reload block A results 16-19
+  mova            m7, [stp + %3 + idx17]
+  mova            m11, [stp + %3 + idx18]
+  mova            m12, [stp + %3 + idx19]
+  SUM_SUB         4, 3, 9  ; stp2_16, stp2_23
+  SUM_SUB         7, 0, 9  ; stp2_17, stp2_22
+  SUM_SUB         11, 14, 9  ; stp2_18, stp2_21
+  SUM_SUB         12, 5, 9  ; stp2_19, stp2_20
+  mova            [stp + %3 + idx16], m4
+  mova            [stp + %3 + idx17], m7
+  mova            [stp + %3 + idx18], m11
+  mova            [stp + %3 + idx19], m12
+
+  mova            m4, [stp + %4 + idx28]  ; reload block A results 28-31
+  mova            m7, [stp + %4 + idx29]
+  mova            m11, [stp + %4 + idx30]
+  mova            m12, [stp + %4 + idx31]
+  SUM_SUB         4, 6, 9  ; stp2_28, stp2_27
+  SUM_SUB         7, 13, 9  ; stp2_29, stp2_26
+  SUM_SUB         11, 1, 9  ; stp2_30, stp2_25
+  SUM_SUB         12, 2, 9  ; stp2_31, stp2_24
+  mova            [stp + %4 + idx28], m4
+  mova            [stp + %4 + idx29], m7
+  mova            [stp + %4 + idx30], m11
+  mova            [stp + %4 + idx31], m12
+
+  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova            m10, [pw_11585x2]
+  SUM_SUB         6, 5, 9
+  pmulhrsw        m6, m10  ; stp1_27
+  pmulhrsw        m5, m10  ; stp1_20
+  SUM_SUB         13, 14, 9
+  pmulhrsw        m13, m10  ; stp1_26
+  pmulhrsw        m14, m10  ; stp1_21
+  SUM_SUB         1, 0, 9
+  pmulhrsw        m1, m10  ; stp1_25
+  pmulhrsw        m0, m10  ; stp1_22
+  SUM_SUB         2, 3, 9
+  pmulhrsw        m2, m10  ; stp1_24
+  pmulhrsw        m3, m10  ; stp1_23
+%else
+  BUTTERFLY_4X    6, 5, 11585, 11585, m8, 9, 10  ; stp1_20, stp1_27
+  SWAP            6, 5
+  BUTTERFLY_4X    13, 14, 11585, 11585, m8, 9, 10  ; stp1_21, stp1_26
+  SWAP            13, 14
+  BUTTERFLY_4X    1, 0, 11585, 11585, m8, 9, 10  ; stp1_22, stp1_25
+  SWAP            1, 0
+  BUTTERFLY_4X    2, 3, 11585, 11585, m8, 9, 10  ; stp1_23, stp1_24
+  SWAP            2, 3
+%endif
+  mova            [stp + %3 + idx20], m5
+  mova            [stp + %3 + idx21], m14
+  mova            [stp + %3 + idx22], m0
+  mova            [stp + %3 + idx23], m3
+  mova            [stp + %4 + idx24], m2
+  mova            [stp + %4 + idx25], m1
+  mova            [stp + %4 + idx26], m13
+  mova            [stp + %4 + idx27], m6
+
+  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m0, [rsp + transposed_in + 16 * 2]
+  mova            m1, [rsp + transposed_in + 16 * 30]
+  BUTTERFLY_4X    0, 1, 1606, 16305, m8, 9, 10  ; stp1_8, stp1_15
+
+  mova            m2, [rsp + transposed_in + 16 * 14]
+  mova            m3, [rsp + transposed_in + 16 * 18]
+  BUTTERFLY_4X    3, 2, 12665, 10394, m8, 9, 10  ; stp1_9, stp1_14
+
+  mova            m4, [rsp + transposed_in + 16 * 10]
+  mova            m5, [rsp + transposed_in + 16 * 22]
+  BUTTERFLY_4X    4, 5, 7723, 14449, m8, 9, 10  ; stp1_10, stp1_13
+
+  mova            m6, [rsp + transposed_in + 16 * 6]
+  mova            m7, [rsp + transposed_in + 16 * 26]
+  BUTTERFLY_4X    7, 6, 15679, 4756, m8, 9, 10  ; stp1_11, stp1_12
+
+  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         0, 3, 9  ; stp1_8, stp1_9
+  SUM_SUB         7, 4, 9  ; stp1_11, stp1_10
+  SUM_SUB         6, 5, 9  ; stp1_12, stp1_13
+  SUM_SUB         1, 2, 9  ; stp1_15, stp1_14
+
+  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  BUTTERFLY_4X    2, 3, 6270, 15137, m8, 9, 10  ; stp1_9, stp1_14
+  BUTTERFLY_4Xmm  5, 4, 6270, 15137, m8, 9, 10  ; stp1_13, stp1_10
+
+  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         0, 7, 9  ; stp1_8, stp1_11
+  SUM_SUB         2, 4, 9  ; stp1_9, stp1_10
+  SUM_SUB         1, 6, 9  ; stp1_15, stp1_12
+  SUM_SUB         3, 5, 9  ; stp1_14, stp1_13
+
+  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova            m10, [pw_11585x2]
+  SUM_SUB         5, 4, 9
+  pmulhrsw        m5, m10  ; stp1_13
+  pmulhrsw        m4, m10  ; stp1_10
+  SUM_SUB         6, 7, 9
+  pmulhrsw        m6, m10  ; stp1_12
+  pmulhrsw        m7, m10  ; stp1_11
+%else
+  BUTTERFLY_4X    5, 4, 11585, 11585, m8, 9, 10  ; stp1_10, stp1_13
+  SWAP            5, 4
+  BUTTERFLY_4X    6, 7, 11585, 11585, m8, 9, 10  ; stp1_11, stp1_12
+  SWAP            6, 7
+%endif
+  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            [stp + %2 + idx8], m0
+  mova            [stp + %2 + idx9], m2
+  mova            [stp + %2 + idx10], m4
+  mova            [stp + %2 + idx11], m7
+  mova            [stp + %2 + idx12], m6
+  mova            [stp + %2 + idx13], m5
+  mova            [stp + %2 + idx14], m3
+  mova            [stp + %2 + idx15], m1
+
+  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  ;
+  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m11, [rsp + transposed_in + 16 * 4]
+  mova            m12, [rsp + transposed_in + 16 * 28]
+  BUTTERFLY_4X    11, 12, 3196, 16069, m8, 9, 10  ; stp1_4, stp1_7
+
+  mova            m13, [rsp + transposed_in + 16 * 12]
+  mova            m14, [rsp + transposed_in + 16 * 20]
+  BUTTERFLY_4X    14, 13, 13623, 9102, m8, 9, 10  ; stp1_5, stp1_6
+
+  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m0, [rsp + transposed_in + 16 * 0]
+  mova            m1, [rsp + transposed_in + 16 * 16]
+
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  mova            m10, [pw_11585x2]
+  SUM_SUB         0, 1, 9
+  pmulhrsw        m0, m10  ; stp1_1
+  pmulhrsw        m1, m10  ; stp1_0
+%else
+  BUTTERFLY_4X    0, 1, 11585, 11585, m8, 9, 10  ; stp1_1, stp1_0
+  SWAP            0, 1
+%endif
+  mova            m2, [rsp + transposed_in + 16 * 8]
+  mova            m3, [rsp + transposed_in + 16 * 24]
+  BUTTERFLY_4X    2, 3, 6270, 15137, m8, 9, 10  ; stp1_2, stp1_3
+
+  mova            m10, [pw_11585x2]  ; NOTE(review): only visibly read by the %if 0 path of STAGE 5 - confirm it is dead otherwise
+  SUM_SUB         11, 14, 9  ; stp1_4, stp1_5
+  SUM_SUB         12, 13, 9  ; stp1_7, stp1_6
+
+  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+%if 0 ; overflow occurs in SUM_SUB when using test streams
+  SUM_SUB         13, 14, 9
+  pmulhrsw        m13, m10  ; stp1_6
+  pmulhrsw        m14, m10  ; stp1_5
+%else
+  BUTTERFLY_4X    13, 14, 11585, 11585, m8, 9, 10  ; stp1_5, stp1_6
+  SWAP            13, 14
+%endif
+  SUM_SUB         0, 3, 9  ; stp1_0, stp1_3
+  SUM_SUB         1, 2, 9  ; stp1_1, stp1_2
+
+  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  SUM_SUB         0, 12, 9  ; stp1_0, stp1_7
+  SUM_SUB         1, 13, 9  ; stp1_1, stp1_6
+  SUM_SUB         2, 14, 9  ; stp1_2, stp1_5
+  SUM_SUB         3, 11, 9  ; stp1_3, stp1_4
+
+  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  mova            m4, [stp + %2 + idx12]  ; reload block C results 12-15
+  mova            m5, [stp + %2 + idx13]
+  mova            m6, [stp + %2 + idx14]
+  mova            m7, [stp + %2 + idx15]
+  SUM_SUB         0, 7, 9  ; stp1_0, stp1_15
+  SUM_SUB         1, 6, 9  ; stp1_1, stp1_14
+  SUM_SUB         2, 5, 9  ; stp1_2, stp1_13
+  SUM_SUB         3, 4, 9  ; stp1_3, stp1_12
+
+  ; 0-3, 28-31 final stage
+  mova            m10, [stp + %4 + idx31]
+  mova            m15, [stp + %4 + idx30]
+  SUM_SUB         0, 10, 9  ; stp1_0, stp1_31
+  SUM_SUB         1, 15, 9  ; stp1_1, stp1_30
+  mova            [stp + %1 + idx0], m0
+  mova            [stp + %1 + idx1], m1
+  mova            [stp + %4 + idx31], m10
+  mova            [stp + %4 + idx30], m15
+  mova            m0, [stp + %4 + idx29]  ; m0/m1 were stored above, so they are reused here
+  mova            m1, [stp + %4 + idx28]
+  SUM_SUB         2, 0, 9  ; stp1_2, stp1_29
+  SUM_SUB         3, 1, 9  ; stp1_3, stp1_28
+  mova            [stp + %1 + idx2], m2
+  mova            [stp + %1 + idx3], m3
+  mova            [stp + %4 + idx29], m0
+  mova            [stp + %4 + idx28], m1
+
+  ; 12-15, 16-19 final stage
+  mova            m0, [stp + %3 + idx16]
+  mova            m1, [stp + %3 + idx17]
+  mova            m2, [stp + %3 + idx18]
+  mova            m3, [stp + %3 + idx19]
+  SUM_SUB         7, 0, 9  ; stp1_15, stp1_16
+  SUM_SUB         6, 1, 9  ; stp1_14, stp1_17
+  SUM_SUB         5, 2, 9  ; stp1_13, stp1_18
+  SUM_SUB         4, 3, 9  ; stp1_12, stp1_19
+  mova            [stp + %2 + idx12], m4
+  mova            [stp + %2 + idx13], m5
+  mova            [stp + %2 + idx14], m6
+  mova            [stp + %2 + idx15], m7
+  mova            [stp + %3 + idx16], m0
+  mova            [stp + %3 + idx17], m1
+  mova            [stp + %3 + idx18], m2
+  mova            [stp + %3 + idx19], m3
+
+  mova            m4, [stp + %2 + idx8]  ; reload block C results 8-11
+  mova            m5, [stp + %2 + idx9]
+  mova            m6, [stp + %2 + idx10]
+  mova            m7, [stp + %2 + idx11]
+  SUM_SUB         11, 7, 9  ; stp1_4, stp1_11
+  SUM_SUB         14, 6, 9  ; stp1_5, stp1_10
+  SUM_SUB         13, 5, 9  ; stp1_6, stp1_9
+  SUM_SUB         12, 4, 9  ; stp1_7, stp1_8
+
+  ; 4-7, 24-27 final stage
+  mova            m3, [stp + %4 + idx24]
+  mova            m2, [stp + %4 + idx25]
+  mova            m1, [stp + %4 + idx26]
+  mova            m0, [stp + %4 + idx27]
+  SUM_SUB         12, 3, 9  ; stp1_7, stp1_24
+  SUM_SUB         13, 2, 9  ; stp1_6, stp1_25
+  SUM_SUB         14, 1, 9  ; stp1_5, stp1_26
+  SUM_SUB         11, 0, 9  ; stp1_4, stp1_27
+  mova            [stp + %4 + idx24], m3
+  mova            [stp + %4 + idx25], m2
+  mova            [stp + %4 + idx26], m1
+  mova            [stp + %4 + idx27], m0
+  mova            [stp + %1 + idx4], m11
+  mova            [stp + %1 + idx5], m14
+  mova            [stp + %1 + idx6], m13
+  mova            [stp + %1 + idx7], m12
+
+  ; 8-11, 20-23 final stage
+  mova            m0, [stp + %3 + idx20]
+  mova            m1, [stp + %3 + idx21]
+  mova            m2, [stp + %3 + idx22]
+  mova            m3, [stp + %3 + idx23]
+  SUM_SUB         7, 0, 9  ; stp1_11, stp1_20
+  SUM_SUB         6, 1, 9  ; stp1_10, stp1_21
+  SUM_SUB         5, 2, 9  ; stp1_9, stp1_22
+  SUM_SUB         4, 3, 9  ; stp1_8, stp1_23
+  mova            [stp + %2 + idx8], m4
+  mova            [stp + %2 + idx9], m5
+  mova            [stp + %2 + idx10], m6
+  mova            [stp + %2 + idx11], m7
+  mova            [stp + %3 + idx20], m0
+  mova            [stp + %3 + idx21], m1
+  mova            [stp + %3 + idx22], m2
+  mova            [stp + %3 + idx23], m3
+%endmacro
+
+INIT_XMM ssse3
+cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
+  mova            m8, [pd_8192]  ; m8 is passed to every BUTTERFLY_4X as its 5th argument
+  mov             r6, 4  ; r6 = outer iterations left in the first pass
+  lea             stp, [rsp + pass_one_start]
+
+idct32x32_1024:
+  mov             r3, inputq
+  lea             r4, [rsp + transposed_in]
+  mov             r7, 4  ; r7 = 8x8 transposes left in this iteration
+
+idct32x32_1024_transpose:
+  mova            m0, [r3 + 0]
+  mova            m1, [r3 + 16 * 4]
+  mova            m2, [r3 + 16 * 8]
+  mova            m3, [r3 + 16 * 12]
+  mova            m4, [r3 + 16 * 16]
+  mova            m5, [r3 + 16 * 20]
+  mova            m6, [r3 + 16 * 24]
+  mova            m7, [r3 + 16 * 28]
+
+  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova            [r4 + 0], m0
+  mova            [r4 + 16 * 1], m1
+  mova            [r4 + 16 * 2], m2
+  mova            [r4 + 16 * 3], m3
+  mova            [r4 + 16 * 4], m4
+  mova            [r4 + 16 * 5], m5
+  mova            [r4 + 16 * 6], m6
+  mova            [r4 + 16 * 7], m7
+
+  add             r3, 16  ; source advances by 16 bytes per transpose
+  add             r4, 16 * 8  ; destination advances by the eight 16-byte slots just written
+  dec             r7
+  jne             idct32x32_1024_transpose
+
+  IDCT32X32_1024  16*0, 16*32, 16*64, 16*96
+
+  lea             stp, [stp + 16 * 8]
+  lea             inputq, [inputq + 16 * 32]
+  dec             r6
+  jnz             idct32x32_1024
+
+  mov             r6, 4  ; r6 = outer iterations left in the second pass
+  lea             stp, [rsp + pass_one_start]
+  lea             r9, [rsp + pass_one_start]  ; r9 = read cursor over the first-pass output
+
+idct32x32_1024_2:
+  lea             r4, [rsp + transposed_in]
+  mov             r3, r9
+  mov             r7, 4  ; r7 = 8x8 transposes left in this iteration
+
+idct32x32_1024_transpose_2:
+  mova            m0, [r3 + 0]
+  mova            m1, [r3 + 16 * 1]
+  mova            m2, [r3 + 16 * 2]
+  mova            m3, [r3 + 16 * 3]
+  mova            m4, [r3 + 16 * 4]
+  mova            m5, [r3 + 16 * 5]
+  mova            m6, [r3 + 16 * 6]
+  mova            m7, [r3 + 16 * 7]
+
+  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9
+
+  mova            [r4 + 0], m0
+  mova            [r4 + 16 * 1], m1
+  mova            [r4 + 16 * 2], m2
+  mova            [r4 + 16 * 3], m3
+  mova            [r4 + 16 * 4], m4
+  mova            [r4 + 16 * 5], m5
+  mova            [r4 + 16 * 6], m6
+  mova            [r4 + 16 * 7], m7
+
+  add             r3, 16 * 8
+  add             r4, 16 * 8
+  dec             r7
+  jne             idct32x32_1024_transpose_2
+
+  IDCT32X32_1024  16*0, 16*8, 16*16, 16*24
+
+  lea             stp, [stp + 16 * 32]
+  add             r9, 16 * 32
+  dec             r6
+  jnz             idct32x32_1024_2
+
+  RECON_AND_STORE pass_two_start
+
   RET
 %endif