From f0b0b1fe627e760d27d9917af7c217f11c57c3ad Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 2 Dec 2015 04:50:46 -0800 Subject: [PATCH] VP9: Add ssse3 version of vpx_idct32x32_135_add() Change-Id: I9a780131efaad28cf1ad233ae64c5c319a329727 --- vp9/common/vp9_idct.c | 3 + vpx_dsp/inv_txfm.c | 27 ++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 18 + vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 466 +++++++++++++++++++++++++- 4 files changed, 511 insertions(+), 3 deletions(-) diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index d12cd76db..1b420143b 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -174,6 +174,9 @@ void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, else if (eob <= 34) // non-zero coeff only in upper-left 8x8 vpx_idct32x32_34_add(input, dest, stride); + else if (eob <= 135) + // non-zero coeff only in upper-left 16x16 + vpx_idct32x32_135_add(input, dest, stride); else vpx_idct32x32_1024_add(input, dest, stride); } diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c index 5f3cfddbd..a0f59bf75 100644 --- a/vpx_dsp/inv_txfm.c +++ b/vpx_dsp/inv_txfm.c @@ -1194,6 +1194,33 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, } } +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = {0}; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 16x16 has non-zero coeff; rows 16-31 of out[] stay zero + for (i = 0; i < 16; ++i) { + idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[32 * 32] = {0}; diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl 
b/vpx_dsp/vpx_dsp_rtcd_defs.pl index e972cd850..c4d3544be 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -754,6 +754,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add/; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vpx_idct32x32_135_add/; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add/; @@ -802,6 +805,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add sse2/; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vpx_idct32x32_135_add sse2/; + $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; # reuses the 1024 sse2 version + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add sse2/; @@ -853,6 +860,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add/; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vpx_idct32x32_135_add/; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add/; @@ -892,6 +902,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; + add_proto qw/void 
vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64_x86inc"; + # Need to add 135 eob idct32x32 implementations; alias the 1024 versions for now. + $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; + $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon; + $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; + $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64_x86inc"; # Need to add 34 eob idct32x32 neon implementation. diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 43c7ad87e..80a330b7b 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -18,9 +18,13 @@ SECTION_RODATA pw_11585x2: times 8 dw 23170 -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 +pw_m2404x2: times 8 dw -2404*2 +pw_m4756x2: times 8 dw -4756*2 +pw_m5520x2: times 8 dw -5520*2 +pw_m8423x2: times 8 dw -8423*2 +pw_m9102x2: times 8 dw -9102*2 +pw_m10394x2: times 8 dw -10394*2 +pw_m11003x2: times 8 dw -11003*2 pw_16364x2: times 8 dw 16364*2 pw_16305x2: times 8 dw 16305*2 @@ -29,6 +33,18 @@ pw_16069x2: times 8 dw 16069*2 pw_15893x2: times 8 dw 15893*2 pw_15679x2: times 8 dw 15679*2 pw_15426x2: times 8 dw 15426*2 +pw_15137x2: times 8 dw 15137*2 +pw_14811x2: times 8 dw 14811*2 +pw_14449x2: times 8 dw 14449*2 +pw_14053x2: times 8 dw 14053*2 +pw_13623x2: times 8 dw 13623*2 +pw_13160x2: times 8 dw 13160*2 +pw_12665x2: times 8 dw 12665*2 +pw_12140x2: times 8 dw 12140*2 +pw__9760x2: times 8 dw 9760*2 +pw__7723x2: times 8 dw 7723*2 +pw__7005x2: times 8 dw 7005*2 +pw__6270x2: times 8 dw 6270*2 pw__3981x2: times 8 dw 3981*2 pw__3196x2: times 8 dw 3196*2 pw__1606x2: times 8 dw 1606*2 @@ -793,6 +809,450 @@ 
idct32x32_34_transpose_2: RET +%macro IDCT32X32_135 4 ; args 1-4: stp offsets for idx0-7, idx8-15, idx16-23, idx24-31 + ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m1, [rsp + transposed_in + 16 * 1] + mova m11, m1 + pmulhrsw m1, [pw___804x2] ; stp1_16 + pmulhrsw m11, [pw_16364x2] ; stp2_31 + + mova m7, [rsp + transposed_in + 16 * 7] + mova m12, m7 + pmulhrsw m7, [pw_15426x2] ; stp1_28 + pmulhrsw m12, [pw_m5520x2] ; stp2_19 + + mova m3, [rsp + transposed_in + 16 * 9] + mova m4, m3 + pmulhrsw m3, [pw__7005x2] ; stp1_18 + pmulhrsw m4, [pw_14811x2] ; stp2_29 + + mova m0, [rsp + transposed_in + 16 * 15] + mova m2, m0 + pmulhrsw m0, [pw_12140x2] ; stp1_30 + pmulhrsw m2, [pw_m11003x2] ; stp2_17 + + ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 2, 9 ; stp2_16, stp2_17 + SUM_SUB 12, 3, 9 ; stp2_19, stp2_18 + SUM_SUB 7, 4, 9 ; stp2_28, stp2_29 + SUM_SUB 11, 0, 9 ; stp2_31, stp2_30 + + ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 + BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 + + ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 + SUM_SUB 0, 3, 9 ; stp2_17, stp2_18 + SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 + SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 + + ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 + BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 + + mova [stp + %3 + idx16], m1 + mova [stp + %3 + idx17], m0 + mova [stp + %3 + idx18], m4 + mova [stp + %3 + idx19], m7 + mova [stp + %4 + idx28], m12 + mova [stp + %4 + idx29], m3 + mova [stp + %4 + idx30], m2 + mova [stp + %4 + idx31], m11 + + ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m2, [rsp + transposed_in + 16 * 3] + mova m3, m2 + pmulhrsw m3, [pw_m2404x2] ; stp1_23 + pmulhrsw m2, 
[pw_16207x2] ; stp2_24 + + mova m5, [rsp + transposed_in + 16 * 5] + mova m6, m5 + pmulhrsw m5, [pw__3981x2] ; stp1_20 + pmulhrsw m6, [pw_15893x2] ; stp2_27 + + mova m14, [rsp + transposed_in + 16 * 11] + mova m13, m14 + pmulhrsw m13, [pw_m8423x2] ; stp1_21 + pmulhrsw m14, [pw_14053x2] ; stp2_26 + + mova m0, [rsp + transposed_in + 16 * 13] + mova m1, m0 + pmulhrsw m0, [pw__9760x2] ; stp1_22 + pmulhrsw m1, [pw_13160x2] ; stp2_25 + + ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 5, 13, 9 ; stp2_20, stp2_21 + SUM_SUB 3, 0, 9 ; stp2_23, stp2_22 + SUM_SUB 2, 1, 9 ; stp2_24, stp2_25 + SUM_SUB 6, 14, 9 ; stp2_27, stp2_26 + + ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 + BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 + + ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 + SUM_SUB 0, 14, 9 ; stp2_22, stp2_21 + SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 + SUM_SUB 1, 13, 9 ; stp2_25, stp2_26 + + ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 + BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 + + ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %3 + idx16] + mova m7, [stp + %3 + idx17] + mova m11, [stp + %3 + idx18] + mova m12, [stp + %3 + idx19] + SUM_SUB 4, 3, 9 ; stp2_16, stp2_23 + SUM_SUB 7, 0, 9 ; stp2_17, stp2_22 + SUM_SUB 11, 14, 9 ; stp2_18, stp2_21 + SUM_SUB 12, 5, 9 ; stp2_19, stp2_20 + mova [stp + %3 + idx16], m4 + mova [stp + %3 + idx17], m7 + mova [stp + %3 + idx18], m11 + mova [stp + %3 + idx19], m12 + + mova m4, [stp + %4 + idx28] + mova m7, [stp + %4 + idx29] + mova m11, [stp + %4 + idx30] + mova m12, [stp + %4 + idx31] + SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 + SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 + 
SUM_SUB 11, 1, 9 ; stp2_30, stp2_25 + SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 + mova [stp + %4 + idx28], m4 + mova [stp + %4 + idx29], m7 + mova [stp + %4 + idx30], m11 + mova [stp + %4 + idx31], m12 + + ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 6, 5, 9 + pmulhrsw m6, m10 ; stp1_27 + pmulhrsw m5, m10 ; stp1_20 + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_26 + pmulhrsw m14, m10 ; stp1_21 + SUM_SUB 1, 0, 9 + pmulhrsw m1, m10 ; stp1_25 + pmulhrsw m0, m10 ; stp1_22 + SUM_SUB 2, 3, 9 + pmulhrsw m2, m10 ; stp1_24 + pmulhrsw m3, m10 ; stp1_23 +%else + BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27 + SWAP 6, 5 + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26 + SWAP 13, 14 + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25 + SWAP 1, 0 + BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24 + SWAP 2, 3 +%endif + mova [stp + %3 + idx20], m5 + mova [stp + %3 + idx21], m14 + mova [stp + %3 + idx22], m0 + mova [stp + %3 + idx23], m3 + mova [stp + %4 + idx24], m2 + mova [stp + %4 + idx25], m1 + mova [stp + %4 + idx26], m13 + mova [stp + %4 + idx27], m6 + + ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 2] + mova m1, m0 + pmulhrsw m0, [pw__1606x2] ; stp1_8 + pmulhrsw m1, [pw_16305x2] ; stp2_15 + + mova m6, [rsp + transposed_in + 16 * 6] + mova m7, m6 + pmulhrsw m7, [pw_m4756x2] ; stp2_11 + pmulhrsw m6, [pw_15679x2] ; stp1_12 + + mova m4, [rsp + transposed_in + 16 * 10] + mova m5, m4 + pmulhrsw m4, [pw__7723x2] ; stp1_10 + pmulhrsw m5, [pw_14449x2] ; stp2_13 + + mova m2, [rsp + transposed_in + 16 * 14] + mova m3, m2 + pmulhrsw m3, [pw_m10394x2] ; stp1_9 + pmulhrsw m2, [pw_12665x2] ; stp2_14 + + ; BLOCK C STAGE 3 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 3, 9 ; stp1_8, stp1_9 + SUM_SUB 7, 4, 9 ; stp1_11, stp1_10 + SUM_SUB 6, 5, 9 ; stp1_12, stp1_13 + SUM_SUB 1, 2, 9 ; stp1_15, stp1_14 + + ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14 + BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10 + + ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 7, 9 ; stp1_8, stp1_11 + SUM_SUB 2, 4, 9 ; stp1_9, stp1_10 + SUM_SUB 1, 6, 9 ; stp1_15, stp1_12 + SUM_SUB 3, 5, 9 ; stp1_14, stp1_13 + + ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 5, 4, 9 + pmulhrsw m5, m10 ; stp1_13 + pmulhrsw m4, m10 ; stp1_10 + SUM_SUB 6, 7, 9 + pmulhrsw m6, m10 ; stp1_12 + pmulhrsw m7, m10 ; stp1_11 +%else + BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13 + SWAP 5, 4 + BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12 + SWAP 6, 7 +%endif + ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova [stp + %2 + idx8], m0 + mova [stp + %2 + idx9], m2 + mova [stp + %2 + idx10], m4 + mova [stp + %2 + idx11], m7 + mova [stp + %2 + idx12], m6 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m3 + mova [stp + %2 + idx15], m1 + + ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; + ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m11, [rsp + transposed_in + 16 * 4] + mova m12, m11 + pmulhrsw m11, [pw__3196x2] ; stp1_4 + pmulhrsw m12, [pw_16069x2] ; stp1_7 + + mova m13, [rsp + transposed_in + 16 * 12] + mova m14, m13 + pmulhrsw m13, [pw_13623x2] ; stp1_6 + pmulhrsw m14, [pw_m9102x2] ; stp1_5 + + ; BLOCK D STAGE 4 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m0, [rsp + transposed_in + 16 * 0] + mova m2, [rsp + transposed_in + 16 * 8] + pmulhrsw m0, [pw_11585x2] ; stp1_1 + mova m3, m2 + pmulhrsw m2, [pw__6270x2] ; stp1_2 + pmulhrsw m3, [pw_15137x2] ; stp1_3 + + SUM_SUB 11, 14, 9 ; stp1_4, stp1_5 + SUM_SUB 12, 13, 9 ; stp1_7, stp1_6 + + ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +%if 0 ; overflow occurs in SUM_SUB when using test streams + mova m10, [pw_11585x2] + SUM_SUB 13, 14, 9 + pmulhrsw m13, m10 ; stp1_6 + pmulhrsw m14, m10 ; stp1_5 +%else + BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6 + SWAP 13, 14 +%endif + mova m1, m0 ; stp1_0 = stp1_1 + SUM_SUB 0, 3, 9 ; stp1_0, stp1_3 + SUM_SUB 1, 2, 9 ; stp1_1, stp1_2 + + ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + SUM_SUB 0, 12, 9 ; stp1_0, stp1_7 + SUM_SUB 1, 13, 9 ; stp1_1, stp1_6 + SUM_SUB 2, 14, 9 ; stp1_2, stp1_5 + SUM_SUB 3, 11, 9 ; stp1_3, stp1_4 + + ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mova m4, [stp + %2 + idx12] + mova m5, [stp + %2 + idx13] + mova m6, [stp + %2 + idx14] + mova m7, [stp + %2 + idx15] + SUM_SUB 0, 7, 9 ; stp1_0, stp1_15 + SUM_SUB 1, 6, 9 ; stp1_1, stp1_14 + SUM_SUB 2, 5, 9 ; stp1_2, stp1_13 + SUM_SUB 3, 4, 9 ; stp1_3, stp1_12 + + ; 0-3, 28-31 final stage + mova m10, [stp + %4 + idx31] + mova m15, [stp + %4 + idx30] + SUM_SUB 0, 10, 9 ; stp1_0, stp1_31 + SUM_SUB 1, 15, 9 ; stp1_1, stp1_30 + mova [stp + %1 + idx0], m0 + mova [stp + %1 + idx1], m1 + mova [stp + %4 + idx31], m10 + mova [stp + %4 + idx30], m15 + mova m0, [stp + %4 + idx29] + mova m1, [stp + %4 + idx28] + SUM_SUB 2, 0, 9 ; stp1_2, stp1_29 + SUM_SUB 3, 1, 9 ; stp1_3, stp1_28 + mova [stp + %1 + idx2], m2 + mova [stp + %1 + idx3], m3 + mova [stp + %4 + idx29], m0 + mova [stp + %4 + idx28], m1 + + ; 12-15, 16-19 final stage + mova m0, [stp + %3 + idx16] + mova m1, [stp + %3 + idx17] + mova m2, [stp 
+ %3 + idx18] + mova m3, [stp + %3 + idx19] + SUM_SUB 7, 0, 9 ; stp1_15, stp1_16 + SUM_SUB 6, 1, 9 ; stp1_14, stp1_17 + SUM_SUB 5, 2, 9 ; stp1_13, stp1_18 + SUM_SUB 4, 3, 9 ; stp1_12, stp1_19 + mova [stp + %2 + idx12], m4 + mova [stp + %2 + idx13], m5 + mova [stp + %2 + idx14], m6 + mova [stp + %2 + idx15], m7 + mova [stp + %3 + idx16], m0 + mova [stp + %3 + idx17], m1 + mova [stp + %3 + idx18], m2 + mova [stp + %3 + idx19], m3 + + mova m4, [stp + %2 + idx8] + mova m5, [stp + %2 + idx9] + mova m6, [stp + %2 + idx10] + mova m7, [stp + %2 + idx11] + SUM_SUB 11, 7, 9 ; stp1_4, stp1_11 + SUM_SUB 14, 6, 9 ; stp1_5, stp1_10 + SUM_SUB 13, 5, 9 ; stp1_6, stp1_9 + SUM_SUB 12, 4, 9 ; stp1_7, stp1_8 + + ; 4-7, 24-27 final stage + mova m3, [stp + %4 + idx24] + mova m2, [stp + %4 + idx25] + mova m1, [stp + %4 + idx26] + mova m0, [stp + %4 + idx27] + SUM_SUB 12, 3, 9 ; stp1_7, stp1_24 + SUM_SUB 13, 2, 9 ; stp1_6, stp1_25 + SUM_SUB 14, 1, 9 ; stp1_5, stp1_26 + SUM_SUB 11, 0, 9 ; stp1_4, stp1_27 + mova [stp + %4 + idx24], m3 + mova [stp + %4 + idx25], m2 + mova [stp + %4 + idx26], m1 + mova [stp + %4 + idx27], m0 + mova [stp + %1 + idx4], m11 + mova [stp + %1 + idx5], m14 + mova [stp + %1 + idx6], m13 + mova [stp + %1 + idx7], m12 + + ; 8-11, 20-23 final stage + mova m0, [stp + %3 + idx20] + mova m1, [stp + %3 + idx21] + mova m2, [stp + %3 + idx22] + mova m3, [stp + %3 + idx23] + SUM_SUB 7, 0, 9 ; stp1_11, stp1_20 + SUM_SUB 6, 1, 9 ; stp1_10, stp1_21 + SUM_SUB 5, 2, 9 ; stp1_9, stp1_22 + SUM_SUB 4, 3, 9 ; stp1_8, stp1_23 + mova [stp + %2 + idx8], m4 + mova [stp + %2 + idx9], m5 + mova [stp + %2 + idx10], m6 + mova [stp + %2 + idx11], m7 + mova [stp + %3 + idx20], m0 + mova [stp + %3 + idx21], m1 + mova [stp + %3 + idx22], m2 + mova [stp + %3 + idx23], m3 +%endmacro + +INIT_XMM ssse3 +cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride + mova m8, [pd_8192] + mov r6, 2 ; outer loop: 2 x (transpose + IDCT32X32_135) + lea stp, [rsp + pass_one_start] + +idct32x32_135: + mov r3, inputq + lea r4, [rsp + 
transposed_in] + mov r7, 2 ; two 8x8 transposes per outer iteration + +idct32x32_135_transpose: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 4] + mova m2, [r3 + 16 * 8] + mova m3, [r3 + 16 * 12] + mova m4, [r3 + 16 * 16] + mova m5, [r3 + 16 * 20] + mova m6, [r3 + 16 * 24] + mova m7, [r3 + 16 * 28] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 + add r4, 16 * 8 + dec r7 + jne idct32x32_135_transpose + + IDCT32X32_135 16*0, 16*32, 16*64, 16*96 + lea stp, [stp + 16 * 8] + lea inputq, [inputq + 16 * 32] + dec r6 + jnz idct32x32_135 + + mov r6, 4 ; outer loop: 4 x (transpose + IDCT32X32_135) + lea stp, [rsp + pass_one_start] + lea r9, [rsp + pass_one_start] + +idct32x32_135_2: + lea r4, [rsp + transposed_in] + mov r3, r9 + mov r7, 2 ; two 8x8 transposes per outer iteration + +idct32x32_135_transpose_2: + mova m0, [r3 + 0] + mova m1, [r3 + 16 * 1] + mova m2, [r3 + 16 * 2] + mova m3, [r3 + 16 * 3] + mova m4, [r3 + 16 * 4] + mova m5, [r3 + 16 * 5] + mova m6, [r3 + 16 * 6] + mova m7, [r3 + 16 * 7] + + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + mova [r4 + 0], m0 + mova [r4 + 16 * 1], m1 + mova [r4 + 16 * 2], m2 + mova [r4 + 16 * 3], m3 + mova [r4 + 16 * 4], m4 + mova [r4 + 16 * 5], m5 + mova [r4 + 16 * 6], m6 + mova [r4 + 16 * 7], m7 + + add r3, 16 * 8 + add r4, 16 * 8 + dec r7 + jne idct32x32_135_transpose_2 + + IDCT32X32_135 16*0, 16*8, 16*16, 16*24 + + lea stp, [stp + 16 * 32] + add r9, 16 * 32 + dec r6 + jnz idct32x32_135_2 + + RECON_AND_STORE pass_two_start + + RET + %macro IDCT32X32_1024 4 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ mova m1, [rsp + transposed_in + 16 * 1] -- 2.40.0