From: DRC
Date: Wed, 17 Dec 2014 08:04:39 +0000 (+0000)
Subject: AltiVec SIMD implementation of slow integer forward DCT; Clean up fast integer forwa...
X-Git-Tag: 1.4.90~146
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=fb0c394037050277c78c0e28eb0ca1590ec58ccd;p=libjpeg-turbo

AltiVec SIMD implementation of slow integer forward DCT; Clean up fast
integer forward DCT code so that it is easier to see how it derives from
the SSE2 code and to make it play more nicely with the slow FDCT code.

git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@1443 632fc199-4ca6-4c93-a231-07263d6284db
---

diff --git a/simd/jsimd.h b/simd/jsimd.h
index b032972..4dcdfc1 100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -545,6 +545,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
 
 EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
 
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
+
 /* Fast Integer Forward DCT */
 EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
 
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c
index a9a5965..ff37c5f 100644
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -226,6 +226,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
 GLOBAL(int)
 jsimd_can_fdct_islow (void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
   return 0;
 }
 
@@ -255,6 +266,7 @@ jsimd_can_fdct_float (void)
 GLOBAL(void)
 jsimd_fdct_islow (DCTELEM * data)
 {
+  jsimd_fdct_islow_altivec(data);
 }
 
 GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c
index e18eaa8..ef32545 100644
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -29,6 +29,9 @@
 #include "jsimd.h"
 #include <altivec.h>
 
+
+/* Common code */
+
 #define TRANSPOSE(row, col) \
 { \
   __vector short row04l, row04h, row15l, row15h, \
@@ -67,15 +70,30 @@
   col##7 = vec_mergel(col67e, col67o);  /* col7=(07 17 27 37 47 57 67 77) */ \
 }
 
-static const __vector short constants __attribute__((aligned(16))) =
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define IFAST_CONST_BITS  8
+#define IFAST_PRE_MULTIPLY_SCALE_BITS  2
+#define IFAST_CONST_SHIFT \
+  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
+
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
 {
-  98 << 5,   /* FIX(0.382683433) */
-  139 << 5,  /* FIX(0.541196100) */
-  181 << 5,  /* FIX(0.707106781) */
-  334 << 5   /* FIX(1.306562965) */
+  98 << IFAST_CONST_SHIFT,   /* FIX(0.382683433) */
+  139 << IFAST_CONST_SHIFT,  /* FIX(0.541196100) */
+  181 << IFAST_CONST_SHIFT,  /* FIX(0.707106781) */
+  334 << IFAST_CONST_SHIFT   /* FIX(1.306562965) */
 };
 
-#define DO_DCT() \
+#define DO_FDCT_IFAST() \
 { \
   /* Even part */ \
 \
@@ -134,11 +152,266 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   /* Constants */
   __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(constants, 0),
-    PW_0541 = vec_splat(constants, 1),
-    PW_0707 = vec_splat(constants, 2),
-    PW_1306 = vec_splat(constants, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
+    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
+
+  /* Pass 1: process rows. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_IFAST();
+
+  /* Pass 2: process columns. */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_IFAST();
+
+  *(__vector short *)&data[0] = out0;
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
+
+
+/* SLOW INTEGER FORWARD DCT */
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define ISLOW_CONST_BITS  13
+#define ISLOW_PASS1_BITS  2
+#define ISLOW_DESCALE_P1  (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
+#define ISLOW_DESCALE_P2  (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
+
+static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_DESCALE_P1 - 1),
+  1 << (ISLOW_DESCALE_P2 - 1)
+};
+
+static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
+{
+  1 << (ISLOW_PASS1_BITS - 1)
+};
+
+#define DO_FDCT_ISLOW_COMMON(PASS) \
+{ \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+\
+  out2l = vec_msums(tmp1312l, PW_F130_F054, zero); \
+  out2h = vec_msums(tmp1312h, PW_F130_F054, zero); \
+  out6l = vec_msums(tmp1312l, PW_F054_MF130, zero); \
+  out6h = vec_msums(tmp1312h, PW_F054_MF130, zero); \
+\
+  out2l = vec_add(out2l, PD_DESCALE_P##PASS); \
+  out2h = vec_add(out2h, PD_DESCALE_P##PASS); \
+  out2l = vec_sr(out2l, DESCALE_P##PASS); \
+  out2h = vec_sr(out2h, DESCALE_P##PASS); \
+\
+  out6l = vec_add(out6l, PD_DESCALE_P##PASS); \
+  out6h = vec_add(out6h, PD_DESCALE_P##PASS); \
+  out6l = vec_sr(out6l, DESCALE_P##PASS); \
+  out6h = vec_sr(out6h, DESCALE_P##PASS); \
+\
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+\
+  /* Odd part */ \
+\
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+\
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+\
+  z3l = vec_msums(z34l, PW_MF078_F117, zero); \
+  z3h = vec_msums(z34h, PW_MF078_F117, zero); \
+  z4l = vec_msums(z34l, PW_F117_F078, zero); \
+  z4h = vec_msums(z34h, PW_F117_F078, zero); \
+\
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+\
+  tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero); \
+  tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero); \
+  tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero); \
+  tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero); \
+\
+  out7l = vec_add(z3l, tmp4l); \
+  out7h = vec_add(z3h, tmp4h); \
+  out1l = vec_add(z4l, tmp7l); \
+  out1h = vec_add(z4h, tmp7h); \
+\
+  out7l = vec_add(out7l, PD_DESCALE_P##PASS); \
+  out7h = vec_add(out7h, PD_DESCALE_P##PASS); \
+  out7l = vec_sr(out7l, DESCALE_P##PASS); \
+  out7h = vec_sr(out7h, DESCALE_P##PASS); \
+\
+  out1l = vec_add(out1l, PD_DESCALE_P##PASS); \
+  out1h = vec_add(out1h, PD_DESCALE_P##PASS); \
+  out1l = vec_sr(out1l, DESCALE_P##PASS); \
+  out1h = vec_sr(out1h, DESCALE_P##PASS); \
+\
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+\
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+\
+  tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero); \
+  tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero); \
+  tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero); \
+  tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero); \
+\
+  out5l = vec_add(tmp5l, z4l); \
+  out5h = vec_add(tmp5h, z4h); \
+  out3l = vec_add(tmp6l, z3l); \
+  out3h = vec_add(tmp6h, z3h); \
+\
+  out5l = vec_add(out5l, PD_DESCALE_P##PASS); \
+  out5h = vec_add(out5h, PD_DESCALE_P##PASS); \
+  out5l = vec_sr(out5l, DESCALE_P##PASS); \
+  out5h = vec_sr(out5h, DESCALE_P##PASS); \
+\
+  out3l = vec_add(out3l, PD_DESCALE_P##PASS); \
+  out3h = vec_add(out3h, PD_DESCALE_P##PASS); \
+  out3l = vec_sr(out3l, DESCALE_P##PASS); \
+  out3h = vec_sr(out3h, DESCALE_P##PASS); \
+\
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_ISLOW_ROWS() \
+{ \
+  /* Even part */ \
+\
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+\
+  out0 = vec_add(tmp10, tmp11); \
+  out0 = vec_sl(out0, PASS1_BITS); \
+  out4 = vec_sub(tmp10, tmp11); \
+  out4 = vec_sl(out4, PASS1_BITS); \
+\
+  DO_FDCT_ISLOW_COMMON(1); \
+}
+
+#define DO_FDCT_ISLOW_COLS() \
+{ \
+  /* Even part */ \
+\
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+\
+  out0 = vec_add(tmp10, tmp11); \
+  out0 = vec_add(out0, PW_DESCALE_P2X); \
+  out0 = vec_sra(out0, PASS1_BITS); \
+  out4 = vec_sub(tmp10, tmp11); \
+  out4 = vec_add(out4, PW_DESCALE_P2X); \
+  out4 = vec_sra(out4, PASS1_BITS); \
+\
+  DO_FDCT_ISLOW_COMMON(2); \
+}
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
+    z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541};
+  __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847};
+  __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175};
+  __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390};
+  __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899};
+  __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899};
+  __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562};
+  __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562};
+  __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+
+  /* Constants */
+  __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
+  __vector int zero = vec_splat_s32(0),
+    PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
+    PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
+  __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
+    DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
 
   /* Pass 1: process rows. */
 
@@ -162,7 +435,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(col3, col4);
   tmp4 = vec_sub(col3, col4);
 
-  DO_DCT();
+  DO_FDCT_ISLOW_ROWS();
 
   /* Pass 2: process columns. */
 
@@ -177,7 +450,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
   tmp3 = vec_add(row3, row4);
   tmp4 = vec_sub(row3, row4);
 
-  DO_DCT();
+  DO_FDCT_ISLOW_COLS();
 
   *(__vector short *)&data[0] = out0;
   *(__vector short *)&data[8] = out1;
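
Note: the FAST INTEGER FORWARD DCT comment above is easiest to check with
scalar arithmetic.  The sketch below is an illustration, not part of the
commit; madds() is a hypothetical portable stand-in for one 16-bit lane of
vec_madds(), and the constants follow the IFAST_* definitions in the patch.

    #include <stdint.h>
    #include <stdio.h>

    /* One lane of vec_madds() (vmhaddshs): the 16-bit saturated sum of c and
       the most significant 17 bits of the 32-bit product a * b, i.e.
       (a * b) >> 15. */
    static int16_t madds(int16_t a, int16_t b, int16_t c)
    {
      int32_t r = (((int32_t)a * (int32_t)b) >> 15) + (int32_t)c;
      if (r > 32767) r = 32767;
      if (r < -32768) r = -32768;
      return (int16_t)r;
    }

    int main(void)
    {
      /* FIX(0.707106781) as an 8-bit (IFAST_CONST_BITS) constant is 181.
         The operand is pre-shifted left by 2 (IFAST_PRE_MULTIPLY_SCALE_BITS)
         and madds() drops 15 product bits, so shifting the constant left by
         16 - 2 - 8 - 1 = 5 (IFAST_CONST_SHIFT) lands the result back at the
         operand's original scale:

           ((x << 2) * (181 << 5)) >> 15  ==  (x * 181) >> 8  ~=  x * 0.707

         SSE2's pmulhw keeps bits 31..16 of the product (>> 16) rather than
         the most significant 17 bits (>> 15), hence the extra -1 here. */
      int16_t x = 1000;
      printf("%d\n", madds((int16_t)(x << 2), 181 << 5, 0));  /* prints 707 */
      return 0;
    }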
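
A similar sketch for the slow FDCT: vec_mergeh()/vec_mergel() interleave two
vectors so that each 32-bit lane of vec_msums() accumulates one pair of
16 x 16-bit products, the same trick the SSE2 code plays with pmaddwd.
msums() below is a hypothetical scalar model of one lane (the underlying
vmsumshs instruction also saturates the 32-bit sum, which is omitted here);
it shows that the PW_F130_F054 pairing reproduces the scalar jfdctint.c
even-part formula.

    #include <stdint.h>
    #include <stdio.h>

    #define F_0_541  4433  /* FIX(0.541196100) */
    #define F_0_765  6270  /* FIX(0.765366865) */

    /* One 32-bit lane of vec_msums(): the two products of an interleaved
       pair, plus an accumulator. */
    static int32_t msums(int16_t a0, int16_t a1, int16_t b0, int16_t b1,
                         int32_t acc)
    {
      return (int32_t)a0 * b0 + (int32_t)a1 * b1 + acc;
    }

    int main(void)
    {
      int16_t tmp13 = 100, tmp12 = -35;

      /* Vector form: tmp13/tmp12 merged, multiplied by the constant pair
         {F_0_541 + F_0_765, F_0_541}. */
      int32_t vec = msums(tmp13, tmp12, F_0_541 + F_0_765, F_0_541, 0);

      /* Scalar form (jfdctint.c), before descaling:
           z1 = (tmp12 + tmp13) * FIX_0_541196100;
           data2 = z1 + tmp13 * FIX_0_765366865; */
      int32_t z1 = (tmp12 + tmp13) * F_0_541;
      int32_t scalar = z1 + tmp13 * F_0_765;

      printf("%d == %d\n", vec, scalar);  /* 915145 == 915145 */
      return 0;
    }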
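
Finally, the PD_DESCALE_P1/PD_DESCALE_P2 constants (1 << (n-1)) that are
added before each vec_sr() are rounding terms: together the add/shift pairs
form the vector equivalent of libjpeg's DESCALE() macro (jdct.h), a right
shift that rounds to nearest instead of truncating.  A minimal scalar
equivalent, continuing the numbers from the previous sketch:

    #include <stdint.h>
    #include <stdio.h>

    #define ISLOW_CONST_BITS  13
    #define ISLOW_PASS1_BITS  2
    #define ISLOW_DESCALE_P1  (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)

    /* DESCALE(x,n): add half of the divisor, then shift right by n. */
    static int32_t descale(int32_t x, int n)
    {
      return (x + ((int32_t)1 << (n - 1))) >> n;
    }

    int main(void)
    {
      /* 915145 / 2^11 = 446.85...: rounds to 447; truncation gives 446. */
      printf("%d\n", descale(915145, ISLOW_DESCALE_P1));  /* prints 447 */
      return 0;
    }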