From e500591710cd1ee2a8dabb0e291a31448ace7077 Mon Sep 17 00:00:00 2001 From: DRC Date: Fri, 27 Sep 2013 17:51:08 +0000 Subject: [PATCH] SIMD-accelerated 3/4 and 3/2 decompression scaling for MIPS DSPr2 git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1047 632fc199-4ca6-4c93-a231-07263d6284db --- ChangeLog.txt | 5 +- jddctmgr.c | 10 + jsimd_none.c | 26 +++ jsimddct.h | 12 + simd/jsimd.h | 10 + simd/jsimd_mips.c | 81 +++++++ simd/jsimd_mips_dspr2.S | 488 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 630 insertions(+), 2 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index c8103e0..690dc2b 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -7,8 +7,9 @@ compatible with X Video.) Also, the decompress-to-YUV function has been extended to support image scaling. [2] Added SIMD acceleration for performing color conversion, downsampling, -and upsampling on DSPr2-capable MIPS platforms. This speeds up the compression -of full-color JPEGs by 6-21% on such platforms and decompression by 6-17%. +upsampling, and IDCT scaling on DSPr2-capable MIPS platforms. This speeds up +the compression of full-color JPEGs by 6-21% on such platforms and +decompression by 6-17%. [3] Added support for 4:1:1 subsampling to the TurboJPEG API. This is mainly included for compatibility, since 4:1:1 is not fully accelerated in diff --git a/jddctmgr.c b/jddctmgr.c index 2bb70a1..e3fabe2 100644 --- a/jddctmgr.c +++ b/jddctmgr.c @@ -133,6 +133,11 @@ start_pass (j_decompress_ptr cinfo) method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 6: +#if defined(__mips__) + if (jsimd_can_idct_6x6()) + method_ptr = jsimd_idct_6x6; + else +#endif method_ptr = jpeg_idct_6x6; method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; @@ -188,6 +193,11 @@ start_pass (j_decompress_ptr cinfo) method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; case 12: +#if defined(__mips__) + if (jsimd_can_idct_12x12()) + method_ptr = jsimd_idct_12x12; + else +#endif method_ptr = jpeg_idct_12x12; method = JDCT_ISLOW; /* jidctint uses islow-style table */ break; diff --git a/jsimd_none.c b/jsimd_none.c index 9787902..882fc08 100644 --- a/jsimd_none.c +++ b/jsimd_none.c @@ -258,6 +258,18 @@ jsimd_can_idct_4x4 (void) return 0; } +GLOBAL(int) +jsimd_can_idct_6x6 (void) +{ + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12 (void) +{ + return 0; +} + GLOBAL(void) jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, @@ -272,6 +284,20 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, { } +GLOBAL(void) +jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + +GLOBAL(void) +jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +} + GLOBAL(int) jsimd_can_idct_islow (void) { diff --git a/jsimddct.h b/jsimddct.h index a1c7440..9d2d945 100644 --- a/jsimddct.h +++ b/jsimddct.h @@ -68,6 +68,8 @@ EXTERN(void) jsimd_quantize_float JPP((JCOEFPTR coef_block, EXTERN(int) jsimd_can_idct_2x2 JPP((void)); EXTERN(int) jsimd_can_idct_4x4 JPP((void)); +EXTERN(int) jsimd_can_idct_6x6 JPP((void)); +EXTERN(int) jsimd_can_idct_12x12 JPP((void)); EXTERN(void) jsimd_idct_2x2 JPP((j_decompress_ptr cinfo, jpeg_component_info * compptr, @@ -79,6 +81,16 @@ EXTERN(void) jsimd_idct_4x4 JPP((j_decompress_ptr cinfo, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col)); +EXTERN(void) jsimd_idct_6x6 JPP((j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); +EXTERN(void) jsimd_idct_12x12 JPP((j_decompress_ptr cinfo, + jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); EXTERN(int) jsimd_can_idct_islow JPP((void)); EXTERN(int) jsimd_can_idct_ifast JPP((void)); diff --git a/simd/jsimd.h b/simd/jsimd.h index c0fea3f..cf0618e 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -743,6 +743,16 @@ EXTERN(void) jsimd_idct_4x4_mips_dspr2 JPP((void * dct_table, JDIMENSION output_col, int * workspace)); +EXTERN(void) jsimd_idct_6x6_mips_dspr2 JPP((void * dct_table, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, + JDIMENSION output_col)); +EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2 JPP((JCOEFPTR coef_block, + void * dct_table, + int * workspace)); +EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2 JPP((int * workspace, + int * output)); + /* SIMD Inverse DCT */ EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table, JCOEFPTR coef_block, diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c index a1b0b9b..d9996f4 100644 --- a/simd/jsimd_mips.c +++ b/simd/jsimd_mips.c @@ -571,6 +571,51 @@ jsimd_can_idct_4x4 (void) return 0; } +GLOBAL(int) +jsimd_can_idct_6x6 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_MIPS_DSPR2)) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_12x12 (void) +{ + init_simd(); + + if (BITS_IN_JSAMPLE != 8) + return 0; + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if (simd_support & JSIMD_MIPS_DSPR2) + return 1; + + return 0; +} + GLOBAL(void) jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, JCOEFPTR coef_block, JSAMPARRAY output_buf, @@ -593,6 +638,42 @@ jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, output_buf, output_col, workspace); } } +GLOBAL(void) +jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ + if ((simd_support & JSIMD_MIPS_DSPR2)) + jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, + output_buf, output_col); +} + +GLOBAL(void) +jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, + JSAMPARRAY output_buf, JDIMENSION output_col) +{ + if (simd_support & JSIMD_MIPS_DSPR2) { + int workspace[96]; + int output[12] = { + (int)(output_buf[0] + output_col), + (int)(output_buf[1] + output_col), + (int)(output_buf[2] + output_col), + (int)(output_buf[3] + output_col), + (int)(output_buf[4] + output_col), + (int)(output_buf[5] + output_col), + (int)(output_buf[6] + output_col), + (int)(output_buf[7] + output_col), + (int)(output_buf[8] + output_col), + (int)(output_buf[9] + output_col), + (int)(output_buf[10] + output_col), + (int)(output_buf[11] + output_col), + }; + jsimd_idct_12x12_pass1_mips_dspr2(coef_block, + compptr->dct_table, workspace); + jsimd_idct_12x12_pass2_mips_dspr2(workspace, output); + } +} GLOBAL(int) jsimd_can_idct_islow (void) diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S index aafd58a..d3ff40a 100644 --- a/simd/jsimd_mips_dspr2.S +++ b/simd/jsimd_mips_dspr2.S @@ -1544,3 +1544,491 @@ LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2) END(jsimd_idct_4x4_mips_dspr2) /*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - output_buf + * a3 - output_col + */ + .set at + + SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + addiu sp, sp, -144 + move v0, sp + addiu v1, v0, 24 + addiu t9, zero, 5793 + addiu s0, zero, 10033 + addiu s1, zero, 2998 + +1: + lh s2, 0(a0) // q0 = quantptr[ 0] + lh s3, 32(a0) // q1 = quantptr[16] + lh s4, 64(a0) // q2 = quantptr[32] + lh t2, 64(a1) // tmp2 = inptr[32] + lh t1, 32(a1) // tmp1 = inptr[16] + lh t0, 0(a1) // tmp0 = inptr[ 0] + mul t2, t2, s4 // tmp2 = tmp2 * q2 + mul t1, t1, s3 // tmp1 = tmp1 * q1 + mul t0, t0, s2 // tmp0 = tmp0 * q0 + lh t6, 16(a1) // z1 = inptr[ 8] + lh t8, 80(a1) // z3 = inptr[40] + lh t7, 48(a1) // z2 = inptr[24] + lh s2, 16(a0) // q0 = quantptr[ 8] + lh s4, 80(a0) // q2 = quantptr[40] + lh s3, 48(a0) // q1 = quantptr[24] + mul t2, t2, t9 // tmp2 = tmp2 * 5793 + mul t1, t1, s0 // tmp1 = tmp1 * 10033 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + mul t6, t6, s2 // z1 = z1 * q0 + mul t8, t8, s4 // z3 = z3 * q2 + mul t7, t7, s3 // z2 = z2 * q1 + addu t3, t0, t2 // tmp10 = tmp0 + tmp2 + sll t2, t2, 1 // tmp2 = tmp2 << 2 + subu t4, t0, t2 // tmp11 = tmp0 - tmp2; + subu t5, t3, t1 // tmp12 = tmp10 - tmp1 + addu t3, t3, t1 // tmp10 = tmp10 + tmp1 + addu t1, t6, t8 // tmp1 = z1 + z3 + mul t1, t1, s1 // tmp1 = tmp1 * 2998 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + subu t2, t6, t8 // tmp2 = z1 - z3 + subu t2, t2, t7 // tmp2 = tmp2 - z2 + sll t2, t2, 2 // tmp2 = tmp2 << 2 + addu t0, t6, t7 // tmp0 = z1 + z2 + sll t0, t0, 13 // tmp0 = tmp0 << 13 + subu s2, t8, t7 // q0 = z3 - z2 + sll s2, s2, 13 // q0 = q0 << 13 + addu t0, t0, t1 // tmp0 = tmp0 + tmp1 + addu t1, s2, t1 // tmp1 = q0 + tmp1 + addu s2, t4, t2 // q0 = tmp11 + tmp2 + subu s3, t4, t2 // q1 = tmp11 - tmp2 + addu t6, t3, t0 // z1 = tmp10 + tmp0 + subu t7, t3, t0 // z2 = tmp10 - tmp0 + addu t4, t5, t1 // tmp11 = tmp12 + tmp1 + subu t5, t5, t1 // tmp12 = tmp12 - tmp1 + shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11 + shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11 + shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11 + shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11 + sw s2, 24(v0) + sw s3, 96(v0) + sw t6, 0(v0) + sw t7, 120(v0) + sw t4, 48(v0) + sw t5, 72(v0) + addiu v0, v0, 4 + addiu a1, a1, 2 + bne v0, v1, 1b + addiu a0, a0, 2 + + /* Pass 2: process 6 rows from work array, store into output array. */ + move v0, sp + addiu v1, v0, 144 + +2: + lw t0, 0(v0) + lw t2, 16(v0) + lw s5, 0(a2) + addiu t0, t0, 16 + sll t0, t0, 13 + mul t3, t2, t9 + lw t6, 4(v0) + lw t8, 20(v0) + lw t7, 12(v0) + addu s5, s5, a3 + addu s6, t6, t8 + mul s6, s6, s1 + addu t1, t0, t3 + subu t4, t0, t3 + subu t4, t4, t3 + lw t3, 8(v0) + mul t0, t3, s0 + addu s7, t6, t7 + sll s7, s7, 13 + addu s7, s6, s7 + subu t2, t8, t7 + sll t2, t2, 13 + addu t2, s6, t2 + subu s6, t6, t7 + subu s6, s6, t8 + sll s6, s6, 13 + addu t3, t1, t0 + subu t5, t1, t0 + addu t6, t3, s7 + subu t3, t3, s7 + addu t7, t4, s6 + subu t4, t4, s6 + addu t8, t5, t2 + subu t5, t5, t2 + shll_s.w t6, t6, 6 + shll_s.w t3, t3, 6 + shll_s.w t7, t7, 6 + shll_s.w t4, t4, 6 + shll_s.w t8, t8, 6 + shll_s.w t5, t5, 6 + sra t6, t6, 24 + addiu t6, t6, 128 + sra t3, t3, 24 + addiu t3, t3, 128 + sb t6, 0(s5) + sra t7, t7, 24 + addiu t7, t7, 128 + sb t3, 5(s5) + sra t4, t4, 24 + addiu t4, t4, 128 + sb t7, 1(s5) + sra t8, t8, 24 + addiu t8, t8, 128 + sb t4, 4(s5) + addiu v0, v0, 24 + sra t5, t5, 24 + addiu t5, t5, 128 + sb t8, 2(s5) + addiu a2, a2, 4 + bne v0, v1, 2b + sb t5, 3(s5) + + addiu sp, sp, 144 + + RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7 + + j ra + nop + +END(jsimd_idct_6x6_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2) +/* + * a0 - compptr->dct_table + * a1 - coef_block + * a2 - workspace + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 8 + +1: + // odd part + lh t0, 48(a1) + lh t1, 48(a0) + lh t2, 16(a1) + lh t3, 16(a0) + lh t4, 80(a1) + lh t5, 80(a0) + lh t6, 112(a1) + lh t7, 112(a0) + mul t0, t0, t1 // z2 + mul t1, t2, t3 // z1 + mul t2, t4, t5 // z3 + mul t3, t6, t7 // z4 + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + li t6, 7053 // FIX(0.860918669) + mul t4, t0,t4 // tmp11 + mul t5, t0,t5 // -tmp14 + addu t7, t1,t2 // tmp10 + addu t8, t7,t3 // tmp10 + z4 + mul t6, t6, t8 // tmp15 + li t8, 2139 // FIX(0.261052384) + mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384)) + li t7, 2295 // FIX(0.280143716) + mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242) + li s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1-=z4 + subu t0, t0, t2 // z2-=z3 + addu t2, t0, t1 // z1+z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_0_765366865 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t8, t6, t8 // tmp12 + addu t3, t8, t4 // tmp12 + tmp11 + addu t3, t3, t7 // tmp10 + subu t8, t8, t9 // tmp12 + tmp13 + addu s0, t5, s0 + subu t8, t8, s0 // tmp12 + subu t9, t6, t9 + subu s1, s1, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + // even part start + lh t4, 64(a1) + lh t5, 64(a0) + lh t7, 32(a1) + lh s0, 32(a0) + lh s1, 0(a1) + lh s2, 0(a0) + lh s3, 96(a1) + lh v0, 96(a0) + mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4]) + mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2]) + mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0]) + mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6]) + // odd part end + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // update counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 2 + addiu a1, a1, 2 + // even part rest + li s1, 10033 + li s2, 11190 + mul t4, t4, s1 // z4 + mul s1, t5, s2 // z4 + sll t5, t5, 13 // z1 + sll t7, t7, 13 + addiu t7, t7, 1024 // z3 + sll s0, s0, 13 // z2 + addu s2, t7, t4 // tmp10 + subu t4, t7, t4 // tmp11 + subu s3, t5, s0 // tmp12 + addu t2, t7, s3 // tmp21 + subu s3, t7, s3 // tmp24 + addu t7, s1, s0 // tmp12 + addu v0, s2, t7 // tmp20 + subu s2, s2, t7 // tmp25 + subu s1, s1, t5 // z4 - z1 + subu s1, s1, s0 // tmp12 + addu s0, t4, s1 // tmp22 + subu t4, t4, s1 // tmp23 + // final output stage + addu t5, v0, t3 + subu v0, v0, t3 + addu t3, t2, t1 + subu t2, t2, t1 + addu t1, s0, t8 + subu s0, s0, t8 + addu t8, t4, t9 + subu t4, t4, t9 + addu t9, s3, t0 + subu s3, s3, t0 + addu t0, s2, t6 + subu s2, s2, t6 + sra t5, t5, 11 + sra t3, t3, 11 + sra t1, t1, 11 + sra t8, t8, 11 + sra t9, t9, 11 + sra t0, t0, 11 + sra s2, s2, 11 + sra s3, s3, 11 + sra t4, t4, 11 + sra s0, s0, 11 + sra t2, t2, 11 + sra v0, v0, 11 + sw t5, 0(a2) + sw t3, 32(a2) + sw t1, 64(a2) + sw t8, 96(a2) + sw t9, 128(a2) + sw t0, 160(a2) + sw s2, 192(a2) + sw s3, 224(a2) + sw t4, 256(a2) + sw s0, 288(a2) + sw t2, 320(a2) + sw v0, 352(a2) + bgtz a3, 1b + addiu a2, a2, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + j ra + nop + +END(jsimd_idct_12x12_pass1_mips_dspr2) + +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) +/* + * a0 - workspace + * a1 - output + */ + + SAVE_REGS_ON_STACK 16, s0, s1, s2, s3 + + li a3, 12 + +1: + // Odd part + lw t0, 12(a0) + lw t1, 4(a0) + lw t2, 20(a0) + lw t3, 28(a0) + li t4, 10703 // FIX(1.306562965) + li t5, 4433 // FIX_0_541196100 + mul t4, t0, t4 // tmp11 + mul t5, t0, t5 // -tmp14 + addu t6, t1, t2 // tmp10 + li t7, 2139 // FIX(0.261052384) + mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384)) + addu t6, t6, t3 // tmp10 + z4 + li t8, 7053 // FIX(0.860918669) + mul t6, t6, t8 // tmp15 + li t8, 2295 // FIX(0.280143716) + mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716)) + addu t9, t2, t3 // z3 + z4 + li s0, 8565 // FIX(1.045510580) + mul t9, t9, s0 // -tmp13 + li s0, 12112 // FIX(1.478575242) + mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242)) + li s1, 12998 // FIX(1.586706681) + mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681)) + li s2, 5540 // FIX(0.676326758) + mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758)) + li s3, 16244 // FIX(1.982889723) + mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723)) + subu t1, t1, t3 // z1 -= z4 + subu t0, t0, t2 // z2 -= z3 + addu t2, t1, t0 // z1 + z2 + li t3, 4433 // FIX_0_541196100 + mul t2, t2, t3 // z3 + li t3, 6270 // FIX_0_765366865 + mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865) + li t3, 15137 // FIX_1_847759065 + mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065) + addu t3, t6, t7 // tmp12 + addu t7, t3, t4 + addu t7, t7, t8 // tmp10 + subu t3, t3, t9 + subu t3, t3, t5 + subu t3, t3, s0 // tmp12 + subu t9, t6, t9 + subu t9, t9, t4 + addu t9, t9, s1 // tmp13 + subu t6, t6, t5 + subu t6, t6, s2 + subu t6, t6, s3 // tmp15 + addu t1, t2, t1 // tmp11 + subu t0, t2, t0 // tmp14 + // even part + lw t2, 16(a0) // z4 + lw t4, 8(a0) // z1 + lw t5, 0(a0) // z3 + lw t8, 24(a0) // z2 + li s0, 10033 // FIX(1.224744871) + li s1, 11190 // FIX(1.366025404) + mul t2, t2, s0 // z4 + mul s0, t4, s1 // z4 + addiu t5, t5, 0x10 + sll t5, t5, 13 // z3 + sll t4, t4, 13 // z1 + sll t8, t8, 13 // z2 + subu s1, t4, t8 // tmp12 + addu s2, t5, t2 // tmp10 + subu t2, t5, t2 // tmp11 + addu s3, t5, s1 // tmp21 + subu s1, t5, s1 // tmp24 + addu t5, s0, t8 // tmp12 + addu v0, s2, t5 // tmp20 + subu t5, s2, t5 // tmp25 + subu t4, s0, t4 + subu t4, t4, t8 // tmp12 + addu t8, t2, t4 // tmp22 + subu t2, t2, t4 // tmp23 + // increment counter and pointers + addiu a3, a3, -1 + addiu a0, a0, 32 + // Final stage + addu t4, v0, t7 + subu v0, v0, t7 + addu t7, s3, t1 + subu s3, s3, t1 + addu t1, t8, t3 + subu t8, t8, t3 + addu t3, t2, t9 + subu t2, t2, t9 + addu t9, s1, t0 + subu s1, s1, t0 + addu t0, t5, t6 + subu t5, t5, t6 + sll t4, t4, 4 + sll t7, t7, 4 + sll t1, t1, 4 + sll t3, t3, 4 + sll t9, t9, 4 + sll t0, t0, 4 + sll t5, t5, 4 + sll s1, s1, 4 + sll t2, t2, 4 + sll t8, t8, 4 + sll s3, s3, 4 + sll v0, v0, 4 + shll_s.w t4, t4, 2 + shll_s.w t7, t7, 2 + shll_s.w t1, t1, 2 + shll_s.w t3, t3, 2 + shll_s.w t9, t9, 2 + shll_s.w t0, t0, 2 + shll_s.w t5, t5, 2 + shll_s.w s1, s1, 2 + shll_s.w t2, t2, 2 + shll_s.w t8, t8, 2 + shll_s.w s3, s3, 2 + shll_s.w v0, v0, 2 + srl t4, t4, 24 + srl t7, t7, 24 + srl t1, t1, 24 + srl t3, t3, 24 + srl t9, t9, 24 + srl t0, t0, 24 + srl t5, t5, 24 + srl s1, s1, 24 + srl t2, t2, 24 + srl t8, t8, 24 + srl s3, s3, 24 + srl v0, v0, 24 + lw t6, 0(a1) + addiu t4, t4, 0x80 + addiu t7, t7, 0x80 + addiu t1, t1, 0x80 + addiu t3, t3, 0x80 + addiu t9, t9, 0x80 + addiu t0, t0, 0x80 + addiu t5, t5, 0x80 + addiu s1, s1, 0x80 + addiu t2, t2, 0x80 + addiu t8, t8, 0x80 + addiu s3, s3, 0x80 + addiu v0, v0, 0x80 + sb t4, 0(t6) + sb t7, 1(t6) + sb t1, 2(t6) + sb t3, 3(t6) + sb t9, 4(t6) + sb t0, 5(t6) + sb t5, 6(t6) + sb s1, 7(t6) + sb t2, 8(t6) + sb t8, 9(t6) + sb s3, 10(t6) + sb v0, 11(t6) + bgtz a3, 1b + addiu a1, a1, 4 + + RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3 + + jr ra + nop + +END(jsimd_idct_12x12_pass2_mips_dspr2) + +/*****************************************************************************/ \ No newline at end of file -- 2.40.0