From: DRC Date: Wed, 9 Oct 2013 18:39:44 +0000 (+0000) Subject: SIMD-accelerated floating point quantize and convsamp routines for MIPS DSPr2 X-Git-Tag: 1.3.90~190 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3d727281696a197a12ba35ceb0beb36bd3938db3;p=libjpeg-turbo SIMD-accelerated floating point quantize and convsamp routines for MIPS DSPr2 git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1058 632fc199-4ca6-4c93-a231-07263d6284db --- diff --git a/simd/jsimd.h b/simd/jsimd.h index ec32a8f..e5c5d44 100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@ -663,6 +663,10 @@ EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT * workspace)); +EXTERN(void) jsimd_convsamp_float_mips_dspr2 JPP((JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT * workspace)); + /* SIMD Forward DCT */ EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data)); EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data)); @@ -711,6 +715,10 @@ EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace)); +EXTERN(void) jsimd_quantize_float_mips_dspr2 JPP((JCOEFPTR coef_block, + FAST_FLOAT * divisors, + FAST_FLOAT * workspace)); + /* SIMD Reduced Inverse DCT */ EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table, JCOEFPTR coef_block, diff --git a/simd/jsimd_mips.c b/simd/jsimd_mips.c index f3b5afe..d8d6b19 100644 --- a/simd/jsimd_mips.c +++ b/simd/jsimd_mips.c @@ -459,6 +459,23 @@ jsimd_can_convsamp (void) GLOBAL(int) jsimd_can_convsamp_float (void) { + init_simd(); + + /* The code is optimised for these values only. NOTE(review): the DSPr2 float convsamp routine in this patch stores workspace entries with swc1 at 4-byte strides, yet sizeof(FAST_FLOAT) is not among the checks below while sizeof(ISLOW_MULT_TYPE) is -- confirm that FAST_FLOAT is a 4-byte float on every supported build. */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_MIPS_DSPR2)) + return 1; + return 0; } @@ -472,6 +489,8 @@ GLOBAL(void) jsimd_convsamp_float (JSAMPARRAY sample_data, 
JDIMENSION start_col, FAST_FLOAT * workspace) { + if ((simd_support & JSIMD_MIPS_DSPR2)) + jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace); } GLOBAL(int) @@ -555,6 +574,23 @@ jsimd_can_quantize (void) GLOBAL(int) jsimd_can_quantize_float (void) { + init_simd(); + + /* The code is optimised for these values only. NOTE(review): the DSPr2 float quantize routine in this patch loads divisors and workspace with lwc1 at 4-byte strides, yet sizeof(FAST_FLOAT) is not among the checks below while sizeof(ISLOW_MULT_TYPE) is -- confirm that FAST_FLOAT is a 4-byte float on every supported build. */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + + if ((simd_support & JSIMD_MIPS_DSPR2)) + return 1; + return 0; } @@ -570,6 +606,8 @@ GLOBAL(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, FAST_FLOAT * workspace) { + if ((simd_support & JSIMD_MIPS_DSPR2)) + jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace); } GLOBAL(int) diff --git a/simd/jsimd_mips_dspr2.S b/simd/jsimd_mips_dspr2.S index bfedae7..d478a6d 100644 --- a/simd/jsimd_mips_dspr2.S +++ b/simd/jsimd_mips_dspr2.S @@ -1665,6 +1665,86 @@ LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2) END(jsimd_quantize_mips_dspr2) +/*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2) +/* Per element i: coef_block[i] = low 16 bits of (trunc(workspace[i] * divisors[i] + 16384.5) - 16384). The loop body handles 8 elements and runs 8 times (t0 steps 63, 55, ..., 7 and exits once it is negative), i.e. 64 elements. + * a0 - coef_block (output; one 16-bit halfword stored per element, advanced by 16 bytes per iteration in the branch delay slot) + * a1 - divisors (input; 4-byte floats, advanced by 32 bytes per iteration) + * a2 - workspace (input; 4-byte floats, advanced by 32 bytes per iteration) + */ + + .set at + + li t1, 0x46800100 // single-precision bit pattern of 16384.5, moved into f0 and used as the madd.s addend; presumably the bias keeps the sum positive so that trunc.w.s acts as round-to-nearest (16384 is subtracted again before each store) + mtc1 t1, f0 + li t0, 63 +0: + lwc1 f1, 0(a2) + lwc1 f5, 0(a1) + lwc1 f2, 4(a2) + lwc1 f6, 4(a1) + lwc1 f3, 8(a2) + lwc1 f7, 8(a1) + lwc1 f4, 12(a2) + lwc1 f8, 12(a1) + madd.s f1, f0, f1, f5 + madd.s f2, f0, f2, f6 + madd.s f3, f0, f3, f7 + madd.s f4, f0, f4, f8 + lwc1 f5, 16(a1) + lwc1 f6, 20(a1) + trunc.w.s f1, f1 + trunc.w.s f2, f2 + trunc.w.s f3, f3 + trunc.w.s f4, f4 + lwc1 f7, 24(a1) + lwc1 f8, 28(a1) + mfc1 t1, f1 + mfc1 t2, f2 + mfc1 t3, f3 + mfc1 t4, f4 + lwc1 f1, 16(a2) + lwc1 f2, 20(a2) + lwc1 f3, 24(a2) + lwc1 f4, 28(a2) + madd.s f1, f0, f1, f5 + madd.s f2, f0, f2, f6 + madd.s f3, f0, f3, f7 + madd.s f4, f0, f4, 
f8 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + trunc.w.s f1, f1 + trunc.w.s f2, f2 + trunc.w.s f3, f3 + trunc.w.s f4, f4 + sh t1, 0(a0) + sh t2, 2(a0) + sh t3, 4(a0) + sh t4, 6(a0) + mfc1 t1, f1 + mfc1 t2, f2 + mfc1 t3, f3 + mfc1 t4, f4 + addiu t0, t0, -8 + addiu a2, a2, 32 + addiu a1, a1, 32 + addiu t1, t1, -16384 + addiu t2, t2, -16384 + addiu t3, t3, -16384 + addiu t4, t4, -16384 + sh t1, 8(a0) + sh t2, 10(a0) + sh t3, 12(a0) + sh t4, 14(a0) + bgez t0, 0b + addiu a0, a0, 16 + + j ra + nop + +END(jsimd_quantize_float_mips_dspr2) /*****************************************************************************/ LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2) /* @@ -2733,3 +2813,363 @@ LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2) END(jsimd_idct_12x12_pass2_mips_dspr2) /*****************************************************************************/ +LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2) +/* Fully unrolled over 8 rows: for each row, load 8 unsigned bytes starting at (row pointer + start_col), subtract 128 from each, convert to single-precision float and store 8 floats to workspace. + * a0 - sample_data (input; row pointers are read from offsets 0, 4, ..., 28) + * a1 - start_col (input; added to each row pointer as a byte offset) + * a2 - workspace (output; 64 4-byte floats written at offsets 0 .. 252) + */ + + .set at + + lw t0, 0(a0) + addu t0, t0, a1 + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 4(a0) + swc1 f1, 0(a2) + swc1 f2, 4(a2) + swc1 f3, 8(a2) + addu t0, t0, a1 + swc1 f4, 12(a2) + swc1 f5, 16(a2) + swc1 f6, 20(a2) + swc1 f7, 24(a2) + swc1 f8, 28(a2) + //elemr 1 (input row 1; its pointer was loaded earlier from 4(a0), interleaved with the previous row's stores) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 8(a0) + swc1 f1, 32(a2) + swc1 f2, 36(a2) + swc1 f3, 40(a2) + addu t0, t0, a1 + swc1 f4, 44(a2) + swc1 f5, 48(a2) + swc1 f6, 52(a2) + swc1 f7, 56(a2) + swc1 f8, 60(a2) + //elemr 2 (input row 2; its pointer was loaded earlier from 8(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + 
addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 12(a0) + swc1 f1, 64(a2) + swc1 f2, 68(a2) + swc1 f3, 72(a2) + addu t0, t0, a1 + swc1 f4, 76(a2) + swc1 f5, 80(a2) + swc1 f6, 84(a2) + swc1 f7, 88(a2) + swc1 f8, 92(a2) + //elemr 3 (input row 3; its pointer was loaded earlier from 12(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 16(a0) + swc1 f1, 96(a2) + swc1 f2, 100(a2) + swc1 f3, 104(a2) + addu t0, t0, a1 + swc1 f4, 108(a2) + swc1 f5, 112(a2) + swc1 f6, 116(a2) + swc1 f7, 120(a2) + swc1 f8, 
124(a2) + //elemr 4 (input row 4; its pointer was loaded earlier from 16(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 20(a0) + swc1 f1, 128(a2) + swc1 f2, 132(a2) + swc1 f3, 136(a2) + addu t0, t0, a1 + swc1 f4, 140(a2) + swc1 f5, 144(a2) + swc1 f6, 148(a2) + swc1 f7, 152(a2) + swc1 f8, 156(a2) + //elemr 5 (input row 5; its pointer was loaded earlier from 20(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 24(a0) + swc1 f1, 160(a2) + swc1 f2, 164(a2) + swc1 f3, 168(a2) + addu t0, t0, a1 + swc1 f4, 172(a2) + swc1 f5, 176(a2) + swc1 f6, 180(a2) + swc1 f7, 184(a2) + swc1 f8, 188(a2) + //elemr 6 (input row 6; its pointer was loaded earlier from 24(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + 
cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + lw t0, 28(a0) + swc1 f1, 192(a2) + swc1 f2, 196(a2) + swc1 f3, 200(a2) + addu t0, t0, a1 + swc1 f4, 204(a2) + swc1 f5, 208(a2) + swc1 f6, 212(a2) + swc1 f7, 216(a2) + swc1 f8, 220(a2) + //elemr 7 (input row 7, the last one; its pointer was loaded earlier from 28(a0)) + lbu t1, 0(t0) + lbu t2, 1(t0) + lbu t3, 2(t0) + lbu t4, 3(t0) + lbu t5, 4(t0) + lbu t6, 5(t0) + lbu t7, 6(t0) + lbu t8, 7(t0) + addiu t1, t1, -128 + addiu t2, t2, -128 + addiu t3, t3, -128 + addiu t4, t4, -128 + addiu t5, t5, -128 + addiu t6, t6, -128 + addiu t7, t7, -128 + addiu t8, t8, -128 + mtc1 t1, f1 + mtc1 t2, f2 + mtc1 t3, f3 + mtc1 t4, f4 + mtc1 t5, f5 + mtc1 t6, f6 + mtc1 t7, f7 + mtc1 t8, f8 + cvt.s.w f1, f1 + cvt.s.w f2, f2 + cvt.s.w f3, f3 + cvt.s.w f4, f4 + cvt.s.w f5, f5 + cvt.s.w f6, f6 + cvt.s.w f7, f7 + cvt.s.w f8, f8 + swc1 f1, 224(a2) + swc1 f2, 228(a2) + swc1 f3, 232(a2) + swc1 f4, 236(a2) + swc1 f5, 240(a2) + swc1 f6, 244(a2) + swc1 f7, 248(a2) + swc1 f8, 252(a2) + + j ra + nop + +END(jsimd_convsamp_float_mips_dspr2) + +/*****************************************************************************/ + diff --git a/simd/jsimd_mips_dspr2_asm.h b/simd/jsimd_mips_dspr2_asm.h index 53cf2bc..50ec31b 100644 --- a/simd/jsimd_mips_dspr2_asm.h +++ b/simd/jsimd_mips_dspr2_asm.h @@ -56,6 +56,39 @@ #define s8 $30 #define ra $31 +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + /* * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2 */