From d3131c1b3dfde62bebf7fb52d22c1ce54c8cea8a Mon Sep 17 00:00:00 2001
From: DRC
Date: Tue, 8 Oct 2013 02:18:59 +0000
Subject: [PATCH] SIMD-accelerated fast integer inverse DCT routine for MIPS DSPr2

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1056 632fc199-4ca6-4c93-a231-07263d6284db
---
Notes:
    This copy of the patch was rebuilt from a version whose line breaks and
    indentation had been collapsed.  The number of added/removed/context
    lines in every hunk was re-checked against the @@ headers and the
    diffstat below, but leading whitespace and column alignment are a best
    guess.  If a hunk is rejected on context, retry with
    "git apply --ignore-whitespace".

    Content changes relative to the collapsed copy are limited to comments:
      * the "//" comments in the mips_idct_ifast_coefs table (jsimd_mips.c)
        are now block comments, matching the other comments in that file;
      * a few same-line "//" comments were added in jsimd_mips_dspr2.S.

    Reading aids:
      * Each mips_idct_ifast_coefs entry holds the same 16-bit constant in
        both halfwords.  Going by the table comments, the constants are the
        jidctfst multipliers pre-divided by 2 (or 4) and scaled by 2^15, with
        the low bits of the stored value rounded off (e.g. 0x4546 -> 0x4540).
        The assembly undoes the pre-division with the shll_s.ph shifts that
        are commented "x2" / "x4".
      * In the .S file an instruction placed directly after a branch or jump
        is indented one extra column.  The code relies on it executing
        whether or not the branch is taken (branch delay slot); this assumes
        the LEAF_MIPS_DSPR2 prologue selects noreorder mode -- TODO confirm
        against the macro definition.

 simd/jsimd.h            |   9 ++
 simd/jsimd_mips.c       |  49 ++++++-
 simd/jsimd_mips_dspr2.S | 293 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 349 insertions(+), 2 deletions(-)

diff --git a/simd/jsimd.h b/simd/jsimd.h
index c090576..ec32a8f 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -789,6 +789,15 @@ EXTERN(void) jsimd_idct_ifast_neon JPP((void * dct_table,
                                         JSAMPARRAY output_buf,
                                         JDIMENSION output_col));
 
+EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2 JPP((JCOEF * inptr,
+                                                   IFAST_MULT_TYPE * quantptr,
+                                                   DCTELEM * wsptr,
+                                                   const int * idct_coefs));
+EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2 JPP((DCTELEM * wsptr,
+                                                   JSAMPARRAY output_buf,
+                                                   JDIMENSION output_col,
+                                                   const int * idct_coefs));
+
 EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
                                          JCOEFPTR coef_block,
                                          JSAMPARRAY output_buf,
diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c
index 63c7714..f3b5afe 100644
--- a/simd/jsimd_mips.c
+++ b/simd/jsimd_mips.c
@@ -78,6 +78,12 @@ init_simd (void)
     return;
 #endif
 }
+static const int mips_idct_ifast_coefs[4] = {
+  0x45404540,           /* FIX( 1.082392200 / 2) =  17734 = 0x4546 */
+  0x5A805A80,           /* FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
+  0x76407640,           /* FIX( 1.847759065 / 2) =  30274 = 0x7642 */
+  0xAC60AC60            /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
 
 GLOBAL(int)
 jsimd_can_rgb_ycc (void)
@@ -726,6 +732,25 @@ jsimd_can_idct_islow (void)
 GLOBAL(int)
 jsimd_can_idct_ifast (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_MIPS_DSPR2))
+    return 1;
+
   return 0;
 }
 
@@ -744,9 +769,29 @@ jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 
 GLOBAL(void)
 jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
+                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                  JDIMENSION output_col)
 {
+  if (simd_support & JSIMD_MIPS_DSPR2) {
+    JCOEFPTR inptr;
+    IFAST_MULT_TYPE * quantptr;
+    DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+    /* Pass 1: process columns from input, store into work array. */
+
+    inptr = coef_block;
+    quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+
+    jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
+                                     workspace, mips_idct_ifast_coefs);
+
+    /* Pass 2: process rows from work array, store into output array. */
+    /* Note that we must descale the results by a factor of 8 == 2**3, */
+    /* and also undo the PASS1_BITS scaling. */
+
+    jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
+                                     output_col, mips_idct_ifast_coefs);
+  }
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S
index ff9ca01..bfedae7 100644
--- a/simd/jsimd_mips_dspr2.S
+++ b/simd/jsimd_mips_dspr2.S
@@ -963,6 +963,299 @@ LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
     j               ra
      nop
 END(jsimd_h2v2_upsample_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
+/*
+ * a0 - inptr
+ * a1 - quantptr
+ * a2 - wsptr
+ * a3 - mips_idct_ifast_coefs
+ */
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu           t9, a0, 16      // end address
+    or              AT, a3, zero    // AT = a3 (coefficient table base)
+
+0:
+    lw              s0, 0(a1)       // quantptr[DCTSIZE*0]
+    lw              t0, 0(a0)       // inptr[DCTSIZE*0]
+    lw              t1, 16(a0)      // inptr[DCTSIZE*1]
+    muleq_s.w.phl   v0, t0, s0      // tmp0 ...
+    lw              t2, 32(a0)      // inptr[DCTSIZE*2]
+    lw              t3, 48(a0)      // inptr[DCTSIZE*3]
+    lw              t4, 64(a0)      // inptr[DCTSIZE*4]
+    lw              t5, 80(a0)      // inptr[DCTSIZE*5]
+    muleq_s.w.phr   t0, t0, s0      // ... tmp0 ...
+    lw              t6, 96(a0)      // inptr[DCTSIZE*6]
+    lw              t7, 112(a0)     // inptr[DCTSIZE*7]
+    or              s4, t1, t2      // s4 = inptr rows 1|2
+    or              s5, t3, t4      // s5 = inptr rows 3|4
+    bnez            s4, 1f          // any of rows 1..7 nonzero -> full IDCT
+     ins            t0, v0, 16, 16  // ... tmp0
+    bnez            s5, 1f
+     or             s6, t5, t6
+    or              s6, s6, t7      // s6 = inptr rows 5|6|7
+    bnez            s6, 1f
+     sw             t0, 0(a2)       // wsptr[DCTSIZE*0]
+    sw              t0, 16(a2)      // wsptr[DCTSIZE*1]
+    sw              t0, 32(a2)      // wsptr[DCTSIZE*2]
+    sw              t0, 48(a2)      // wsptr[DCTSIZE*3]
+    sw              t0, 64(a2)      // wsptr[DCTSIZE*4]
+    sw              t0, 80(a2)      // wsptr[DCTSIZE*5]
+    sw              t0, 96(a2)      // wsptr[DCTSIZE*6]
+    sw              t0, 112(a2)     // wsptr[DCTSIZE*7]
+    addiu           a0, a0, 4
+    b               2f
+     addiu          a1, a1, 4
+
+1:
+    lw              s1, 32(a1)      // quantptr[DCTSIZE*2]
+    lw              s2, 64(a1)      // quantptr[DCTSIZE*4]
+    muleq_s.w.phl   v0, t2, s1      // tmp1 ...
+    muleq_s.w.phr   t2, t2, s1      // ... tmp1 ...
+    lw              s0, 16(a1)      // quantptr[DCTSIZE*1]
+    lw              s1, 48(a1)      // quantptr[DCTSIZE*3]
+    lw              s3, 96(a1)      // quantptr[DCTSIZE*6]
+    muleq_s.w.phl   v1, t4, s2      // tmp2 ...
+    muleq_s.w.phr   t4, t4, s2      // ... tmp2 ...
+    lw              s2, 80(a1)      // quantptr[DCTSIZE*5]
+    lw              t8, 4(AT)       // FIX(1.414213562)
+    ins             t2, v0, 16, 16  // ... tmp1
+    muleq_s.w.phl   v0, t6, s3      // tmp3 ...
+    muleq_s.w.phr   t6, t6, s3      // ... tmp3 ...
+    ins             t4, v1, 16, 16  // ... tmp2
+    addq.ph         s4, t0, t4      // tmp10
+    subq.ph         s5, t0, t4      // tmp11
+    ins             t6, v0, 16, 16  // ... tmp3
+    subq.ph         s6, t2, t6      // tmp12 ...
+    addq.ph         s7, t2, t6      // tmp13
+    mulq_s.ph       s6, s6, t8      // ... tmp12 ...
+    addq.ph         t0, s4, s7      // tmp0
+    subq.ph         t6, s4, s7      // tmp3
+    muleq_s.w.phl   v0, t1, s0      // tmp4 ...
+    muleq_s.w.phr   t1, t1, s0      // ... tmp4 ...
+    shll_s.ph       s6, s6, 1       // x2
+    lw              s3, 112(a1)     // quantptr[DCTSIZE*7]
+    subq.ph         s6, s6, s7      // ... tmp12
+    muleq_s.w.phl   v1, t7, s3      // tmp7 ...
+    muleq_s.w.phr   t7, t7, s3      // ... tmp7 ...
+    ins             t1, v0, 16, 16  // ... tmp4
+    addq.ph         t2, s5, s6      // tmp1
+    subq.ph         t4, s5, s6      // tmp2
+    muleq_s.w.phl   v0, t5, s2      // tmp6 ...
+    muleq_s.w.phr   t5, t5, s2      // ... tmp6 ...
+    ins             t7, v1, 16, 16  // ... tmp7
+    addq.ph         s5, t1, t7      // z11
+    subq.ph         s6, t1, t7      // z12
+    muleq_s.w.phl   v1, t3, s1      // tmp5 ...
+    muleq_s.w.phr   t3, t3, s1      // ... tmp5 ...
+    ins             t5, v0, 16, 16  // ... tmp6
+    ins             t3, v1, 16, 16  // ... tmp5
+    addq.ph         s7, t5, t3      // z13
+    subq.ph         v0, t5, t3      // z10
+    addq.ph         t7, s5, s7      // tmp7
+    subq.ph         s5, s5, s7      // tmp11 ...
+    addq.ph         v1, v0, s6      // z5 ...
+    mulq_s.ph       s5, s5, t8      // ... tmp11
+    lw              t8, 8(AT)       // FIX(1.847759065)
+    lw              s4, 0(AT)       // FIX(1.082392200)
+    addq.ph         s0, t0, t7
+    subq.ph         s1, t0, t7
+    mulq_s.ph       v1, v1, t8      // ... z5
+    shll_s.ph       s5, s5, 1       // x2
+    lw              t8, 12(AT)      // FIX(-2.613125930)
+    sw              s0, 0(a2)       // wsptr[DCTSIZE*0]
+    shll_s.ph       v0, v0, 1       // x4
+    mulq_s.ph       v0, v0, t8      // tmp12 ...
+    mulq_s.ph       s4, s6, s4      // tmp10 ...
+    shll_s.ph       v1, v1, 1       // x2
+    addiu           a0, a0, 4
+    addiu           a1, a1, 4
+    sw              s1, 112(a2)     // wsptr[DCTSIZE*7]
+    shll_s.ph       s6, v0, 1       // x4
+    shll_s.ph       s4, s4, 1       // x2
+    addq.ph         s6, s6, v1      // ... tmp12
+    subq.ph         t5, s6, t7      // tmp6
+    subq.ph         s4, s4, v1      // ... tmp10
+    subq.ph         t3, s5, t5      // tmp5
+    addq.ph         s2, t2, t5
+    addq.ph         t1, s4, t3      // tmp4
+    subq.ph         s3, t2, t5
+    sw              s2, 16(a2)      // wsptr[DCTSIZE*1]
+    sw              s3, 96(a2)      // wsptr[DCTSIZE*6]
+    addq.ph         v0, t4, t3
+    subq.ph         v1, t4, t3
+    sw              v0, 32(a2)      // wsptr[DCTSIZE*2]
+    sw              v1, 80(a2)      // wsptr[DCTSIZE*5]
+    addq.ph         v0, t6, t1
+    subq.ph         v1, t6, t1
+    sw              v0, 64(a2)      // wsptr[DCTSIZE*4]
+    sw              v1, 48(a2)      // wsptr[DCTSIZE*3]
+
+2:
+    bne             a0, t9, 0b
+     addiu          a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j               ra
+     nop
+
+END(jsimd_idct_ifast_cols_mips_dspr2)
+
+/*****************************************************************************/
+LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
+/*
+ * a0 - wsptr
+ * a1 - output_buf
+ * a2 - output_col
+ * a3 - mips_idct_ifast_coefs
+ */
+
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu           t9, a0, 128     // end address
+    lui             s8, 0x8080
+    ori             s8, s8, 0x8080  // s8 = 0x80808080 (0x80 in each byte)
+
+0:
+    lw              AT, 36(sp)      // restore $a3 (mips_idct_ifast_coefs)
+    lw              t0, 0(a0)       // wsptr[DCTSIZE*0+0/1]  b a
+    lw              s0, 16(a0)      // wsptr[DCTSIZE*1+0/1]  B A
+    lw              t2, 4(a0)       // wsptr[DCTSIZE*0+2/3]  d c
+    lw              s2, 20(a0)      // wsptr[DCTSIZE*1+2/3]  D C
+    lw              t4, 8(a0)       // wsptr[DCTSIZE*0+4/5]  f e
+    lw              s4, 24(a0)      // wsptr[DCTSIZE*1+4/5]  F E
+    lw              t6, 12(a0)      // wsptr[DCTSIZE*0+6/7]  h g
+    lw              s6, 28(a0)      // wsptr[DCTSIZE*1+6/7]  H G
+    precrq.ph.w     t1, s0, t0      // B b
+    ins             t0, s0, 16, 16  // A a
+    bnez            t1, 1f
+     or             s0, t2, s2
+    bnez            s0, 1f
+     or             s0, t4, s4
+    bnez            s0, 1f
+     or             s0, t6, s6
+    bnez            s0, 1f
+     shll_s.ph      s0, t0, 2       // A a
+    lw              a3, 0(a1)
+    lw              AT, 4(a1)
+    precrq.ph.w     t0, s0, s0      // A A
+    ins             s0, s0, 16, 16  // a a
+    addu            a3, a3, a2
+    addu            AT, AT, a2
+    precrq.qb.ph    t0, t0, t0      // A A A A
+    precrq.qb.ph    s0, s0, s0      // a a a a
+    addu.qb         s0, s0, s8
+    addu.qb         t0, t0, s8
+    sw              s0, 0(a3)
+    sw              s0, 4(a3)
+    sw              t0, 0(AT)
+    sw              t0, 4(AT)
+    addiu           a0, a0, 32
+    bne             a0, t9, 0b
+     addiu          a1, a1, 8
+    b               2f
+     nop
+
+1:
+    precrq.ph.w     t3, s2, t2
+    ins             t2, s2, 16, 16
+    precrq.ph.w     t5, s4, t4
+    ins             t4, s4, 16, 16
+    precrq.ph.w     t7, s6, t6
+    ins             t6, s6, 16, 16
+    lw              t8, 4(AT)       // FIX(1.414213562)
+    addq.ph         s4, t0, t4      // tmp10
+    subq.ph         s5, t0, t4      // tmp11
+    subq.ph         s6, t2, t6      // tmp12 ...
+    addq.ph         s7, t2, t6      // tmp13
+    mulq_s.ph       s6, s6, t8      // ... tmp12 ...
+    addq.ph         t0, s4, s7      // tmp0
+    subq.ph         t6, s4, s7      // tmp3
+    shll_s.ph       s6, s6, 1       // x2
+    subq.ph         s6, s6, s7      // ... tmp12
+    addq.ph         t2, s5, s6      // tmp1
+    subq.ph         t4, s5, s6      // tmp2
+    addq.ph         s5, t1, t7      // z11
+    subq.ph         s6, t1, t7      // z12
+    addq.ph         s7, t5, t3      // z13
+    subq.ph         v0, t5, t3      // z10
+    addq.ph         t7, s5, s7      // tmp7
+    subq.ph         s5, s5, s7      // tmp11 ...
+    addq.ph         v1, v0, s6      // z5 ...
+    mulq_s.ph       s5, s5, t8      // ... tmp11
+    lw              t8, 8(AT)       // FIX(1.847759065)
+    lw              s4, 0(AT)       // FIX(1.082392200)
+    addq.ph         s0, t0, t7      // tmp0 + tmp7
+    subq.ph         s7, t0, t7      // tmp0 - tmp7
+    mulq_s.ph       v1, v1, t8      // ... z5
+    lw              a3, 0(a1)
+    lw              t8, 12(AT)      // FIX(-2.613125930)
+    shll_s.ph       s5, s5, 1       // x2
+    addu            a3, a3, a2
+    shll_s.ph       v0, v0, 1       // x4
+    mulq_s.ph       v0, v0, t8      // tmp12 ...
+    mulq_s.ph       s4, s6, s4      // tmp10 ...
+    shll_s.ph       v1, v1, 1       // x2
+    addiu           a0, a0, 32
+    addiu           a1, a1, 8
+    shll_s.ph       s6, v0, 1       // x4
+    shll_s.ph       s4, s4, 1       // x2
+    addq.ph         s6, s6, v1      // ... tmp12
+    shll_s.ph       s0, s0, 2
+    subq.ph         t5, s6, t7      // tmp6
+    subq.ph         s4, s4, v1      // ... tmp10
+    subq.ph         t3, s5, t5      // tmp5
+    shll_s.ph       s7, s7, 2
+    addq.ph         t1, s4, t3      // tmp4
+    addq.ph         s1, t2, t5      // tmp1 + tmp6
+    subq.ph         s6, t2, t5      // tmp1 - tmp6
+    addq.ph         s2, t4, t3      // tmp2 + tmp5
+    subq.ph         s5, t4, t3      // tmp2 - tmp5
+    addq.ph         s4, t6, t1      // tmp3 + tmp4
+    subq.ph         s3, t6, t1      // tmp3 - tmp4
+    shll_s.ph       s1, s1, 2
+    shll_s.ph       s2, s2, 2
+    shll_s.ph       s3, s3, 2
+    shll_s.ph       s4, s4, 2
+    shll_s.ph       s5, s5, 2
+    shll_s.ph       s6, s6, 2
+    precrq.ph.w     t0, s1, s0      // B A
+    ins             s0, s1, 16, 16  // b a
+    precrq.ph.w     t2, s3, s2      // D C
+    ins             s2, s3, 16, 16  // d c
+    precrq.ph.w     t4, s5, s4      // F E
+    ins             s4, s5, 16, 16  // f e
+    precrq.ph.w     t6, s7, s6      // H G
+    ins             s6, s7, 16, 16  // h g
+    precrq.qb.ph    t0, t2, t0      // D C B A
+    precrq.qb.ph    s0, s2, s0      // d c b a
+    precrq.qb.ph    t4, t6, t4      // H G F E
+    precrq.qb.ph    s4, s6, s4      // h g f e
+    addu.qb         s0, s0, s8
+    addu.qb         s4, s4, s8
+    sw              s0, 0(a3)       // outptr[0/1/2/3]       d c b a
+    sw              s4, 4(a3)       // outptr[4/5/6/7]       h g f e
+    lw              a3, -4(a1)
+    addu.qb         t0, t0, s8
+    addu            a3, a3, a2
+    addu.qb         t4, t4, s8
+    sw              t0, 0(a3)       // outptr[0/1/2/3]       D C B A
+    bne             a0, t9, 0b
+     sw             t4, 4(a3)       // outptr[4/5/6/7]       H G F E
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j               ra
+     nop
+
+END(jsimd_idct_ifast_rows_mips_dspr2)
+
 /*****************************************************************************/
 LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
 /*
-- 
2.40.0