if SIMD_POWERPC
-libsimd_la_SOURCES = jsimd_powerpc.c jsimd_powerpc_altivec.c
-libsimd_la_CFLAGS = -maltivec
+libsimd_la_SOURCES = jsimd_powerpc.c \
+ jfdctfst-altivec.c jfdctint-altivec.c \
+ jidctfst-altivec.c jidctint-altivec.c
+libsimd_la_CFLAGS = -maltivec
endif
--- /dev/null
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT). This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_382 98 /* FIX(0.382683433) */
+#define F_0_541 139 /* FIX(0.541196100) */
+#define F_0_707 181 /* FIX(0.707106781) */
+#define F_1_306 334 /* FIX(1.306562965) */
+
+#define CONST_BITS 8
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
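+/*
+ * Worked example of the scaling (illustration only): with
+ * PRE_MULTIPLY_SCALE_BITS = 2 and CONST_BITS = 8, CONST_SHIFT is
+ * 16 - 2 - 8 - 1 = 5, so a multiply by 0.707 becomes
+ *
+ *   ((x << 2) * (F_0_707 << 5)) >> 15  =  (x * 181) >> 8  ~=  x * 0.707
+ *
+ * where the ">> 15" is the "most significant 17 bits" kept by vec_madds().
+ * The SSE2 code discards 16 bits of the product instead, which is why its
+ * constants are shifted left by one more bit than these.
+ */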
+
+#define DO_FDCT() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out4 = vec_sub(tmp10, tmp11); \
+ \
+ z1 = vec_add(tmp12, tmp13); \
+ z1 = vec_sl(z1, pre_multiply_scale_bits); \
+ z1 = vec_madds(z1, pw_0707, zero); \
+ \
+ out2 = vec_add(tmp13, z1); \
+ out6 = vec_sub(tmp13, z1); \
+ \
+ /* Odd part */ \
+ \
+ tmp10 = vec_add(tmp4, tmp5); \
+ tmp11 = vec_add(tmp5, tmp6); \
+ tmp12 = vec_add(tmp6, tmp7); \
+ \
+ tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ z5 = vec_sub(tmp10, tmp12); \
+ z5 = vec_madds(z5, pw_0382, zero); \
+ \
+ z2 = vec_madds(tmp10, pw_0541, z5); \
+ z4 = vec_madds(tmp12, pw_1306, z5); \
+ \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ z3 = vec_madds(tmp11, pw_0707, zero); \
+ \
+ z11 = vec_add(tmp7, z3); \
+ z13 = vec_sub(tmp7, z3); \
+ \
+ out5 = vec_add(z13, z2); \
+ out3 = vec_sub(z13, z2); \
+ out1 = vec_add(z11, z4); \
+ out7 = vec_sub(z11, z4); \
+}
+
+
+void
+jsimd_fdct_ifast_altivec (DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z1, z2, z3, z4, z5, z11, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+
+ /* Constants */
+ __vector short zero = vec_splat_s16(0),
+ pw_0382 = { __8X(F_0_382 << CONST_SHIFT) },
+ pw_0541 = { __8X(F_0_541 << CONST_SHIFT) },
+ pw_0707 = { __8X(F_0_707 << CONST_SHIFT) },
+ pw_1306 = { __8X(F_1_306 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) };
+
+ /* Pass 1: process rows. */
+
+ row0 = *(__vector short *)&data[0];
+ row1 = *(__vector short *)&data[8];
+ row2 = *(__vector short *)&data[16];
+ row3 = *(__vector short *)&data[24];
+ row4 = *(__vector short *)&data[32];
+ row5 = *(__vector short *)&data[40];
+ row6 = *(__vector short *)&data[48];
+ row7 = *(__vector short *)&data[56];
+
+ TRANSPOSE(row, col);
+
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT();
+
+ /* Pass 2: process columns. */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT();
+
+ *(__vector short *)&data[0] = out0;
+ *(__vector short *)&data[8] = out1;
+ *(__vector short *)&data[16] = out2;
+ *(__vector short *)&data[24] = out3;
+ *(__vector short *)&data[32] = out4;
+ *(__vector short *)&data[40] = out5;
+ *(__vector short *)&data[48] = out6;
+ *(__vector short *)&data[56] = out7;
+}
--- /dev/null
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
+
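+/*
+ * Rounding note (informal): pd_descale_p1/p2 below hold 1 << (DESCALE_Px - 1)
+ * and are used as the accumulator argument of vec_msums(), so the subsequent
+ * vec_sra() by DESCALE_Px computes (x + 2^(n-1)) >> n, i.e. a right shift
+ * with rounding, mirroring the DESCALE() macro used by the scalar code.
+ */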
+
+#define DO_FDCT_COMMON(PASS) \
+{ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ \
+ out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+ out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+ out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+ out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+ z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+ z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+ z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ \
+ out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+ out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+ out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+ out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+ \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ \
+ out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+ out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+ out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+ out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+ \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_ROWS() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_COLS() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, pw_descale_p2x); \
+ out0 = vec_sra(out0, pass1_bits); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, pw_descale_p2x); \
+ out4 = vec_sra(out4, pass1_bits); \
+ \
+ DO_FDCT_COMMON(2); \
+}
+
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ /* Constants */
+ __vector short
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+ pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) };
+
+ /* Pass 1: process rows. */
+
+ row0 = *(__vector short *)&data[0];
+ row1 = *(__vector short *)&data[8];
+ row2 = *(__vector short *)&data[16];
+ row3 = *(__vector short *)&data[24];
+ row4 = *(__vector short *)&data[32];
+ row5 = *(__vector short *)&data[40];
+ row6 = *(__vector short *)&data[48];
+ row7 = *(__vector short *)&data[56];
+
+ TRANSPOSE(row, col);
+
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_ROWS();
+
+ /* Pass 2: process columns. */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_COLS();
+
+ *(__vector short *)&data[0] = out0;
+ *(__vector short *)&data[8] = out1;
+ *(__vector short *)&data[16] = out2;
+ *(__vector short *)&data[24] = out3;
+ *(__vector short *)&data[32] = out4;
+ *(__vector short *)&data[40] = out5;
+ *(__vector short *)&data[48] = out6;
+ *(__vector short *)&data[56] = out7;
+}
--- /dev/null
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in CONST_SHIFT). This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+#include "jsimd_altivec.h"
+
+
+#define F_1_082 277 /* FIX(1.082392200) */
+#define F_1_414 362 /* FIX(1.414213562) */
+#define F_1_847 473 /* FIX(1.847759065) */
+#define F_2_613 669 /* FIX(2.613125930) */
+#define F_1_613 (F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
+
+#define CONST_BITS 8
+#define PASS1_BITS 2
+#define PRE_MULTIPLY_SCALE_BITS 2
+#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+
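+/*
+ * Note: F_1_613 represents FIX(2.613125930) - FIX(1).  In DO_IDCT() below,
+ * tmp12 = vec_madds(z10s, pw_MF1613, z5) followed by tmp12 -= z10 therefore
+ * computes approximately z5 - 2.613125930 * z10; that is, the multiply by
+ * -2.613125930 is split into a multiply by -(2.613125930 - 1) plus an extra
+ * subtraction of z10.
+ */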
+
+#define DO_IDCT(in) \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(in##0, in##4); \
+ tmp11 = vec_sub(in##0, in##4); \
+ tmp13 = vec_add(in##2, in##6); \
+ \
+ tmp12 = vec_sub(in##2, in##6); \
+ tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+ tmp12 = vec_madds(tmp12, pw_F1414, zero); \
+ tmp12 = vec_sub(tmp12, tmp13); \
+ \
+ tmp0 = vec_add(tmp10, tmp13); \
+ tmp3 = vec_sub(tmp10, tmp13); \
+ tmp1 = vec_add(tmp11, tmp12); \
+ tmp2 = vec_sub(tmp11, tmp12); \
+ \
+ /* Odd part */ \
+ \
+ z13 = vec_add(in##5, in##3); \
+ z10 = vec_sub(in##5, in##3); \
+ z10s = vec_sl(z10, pre_multiply_scale_bits); \
+ z11 = vec_add(in##1, in##7); \
+ z12s = vec_sub(in##1, in##7); \
+ z12s = vec_sl(z12s, pre_multiply_scale_bits); \
+ \
+ tmp11 = vec_sub(z11, z13); \
+ tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+ tmp11 = vec_madds(tmp11, pw_F1414, zero); \
+ \
+ tmp7 = vec_add(z11, z13); \
+ \
+ z5 = vec_add(z10s, z12s); \
+ z5 = vec_madds(z5, pw_F1847, zero); \
+ \
+ tmp10 = vec_madds(z12s, pw_F1082, zero); \
+ tmp10 = vec_sub(tmp10, z5); \
+ tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+ tmp12 = vec_sub(tmp12, z10); \
+ \
+ tmp6 = vec_sub(tmp12, tmp7); \
+ tmp5 = vec_sub(tmp11, tmp6); \
+ tmp4 = vec_add(tmp10, tmp5); \
+ \
+ out0 = vec_add(tmp0, tmp7); \
+ out1 = vec_add(tmp1, tmp6); \
+ out2 = vec_add(tmp2, tmp5); \
+ out3 = vec_sub(tmp3, tmp4); \
+ out4 = vec_add(tmp3, tmp4); \
+ out5 = vec_sub(tmp2, tmp5); \
+ out6 = vec_sub(tmp1, tmp6); \
+ out7 = vec_sub(tmp0, tmp7); \
+}
+
+
+void
+jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ z5, z10, z10s, z11, z12s, z13,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector signed char outb;
+ long long *outptr, *outbptr = (long long *)(&outb);
+
+ /* Constants */
+ __vector short zero = { __8X(0) },
+ pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
+ pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
+ pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
+ pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
+ __vector unsigned short
+ pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
+ pass1_bits3 = { __8X(PASS1_BITS + 3) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns. */
+
+ col0 = *(__vector short *)&coef_block[0];
+ col1 = *(__vector short *)&coef_block[8];
+ col2 = *(__vector short *)&coef_block[16];
+ col3 = *(__vector short *)&coef_block[24];
+ col4 = *(__vector short *)&coef_block[32];
+ col5 = *(__vector short *)&coef_block[40];
+ col6 = *(__vector short *)&coef_block[48];
+ col7 = *(__vector short *)&coef_block[56];
+
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+
+ quant0 = *(__vector short *)&dct_table[0];
+ col0 = vec_mladd(col0, quant0, zero);
+
+ if (vec_all_eq(tmp1, zero)) {
+ /* AC terms all zero */
+
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+
+ quant1 = *(__vector short *)&dct_table[8];
+ quant2 = *(__vector short *)&dct_table[16];
+ quant3 = *(__vector short *)&dct_table[24];
+ quant4 = *(__vector short *)&dct_table[32];
+ quant5 = *(__vector short *)&dct_table[40];
+ quant6 = *(__vector short *)&dct_table[48];
+ quant7 = *(__vector short *)&dct_table[56];
+
+ col1 = vec_mladd(col1, quant1, zero);
+ col2 = vec_mladd(col2, quant2, zero);
+ col3 = vec_mladd(col3, quant3, zero);
+ col4 = vec_mladd(col4, quant4, zero);
+ col5 = vec_mladd(col5, quant5, zero);
+ col6 = vec_mladd(col6, quant6, zero);
+ col7 = vec_mladd(col7, quant7, zero);
+
+ DO_IDCT(col);
+
+ TRANSPOSE(out, row);
+ }
+
+ /* Pass 2: process rows. */
+
+ DO_IDCT(row);
+
+ out0 = vec_sra(out0, pass1_bits3);
+ out1 = vec_sra(out1, pass1_bits3);
+ out2 = vec_sra(out2, pass1_bits3);
+ out3 = vec_sra(out3, pass1_bits3);
+ out4 = vec_sra(out4, pass1_bits3);
+ out5 = vec_sra(out5, pass1_bits3);
+ out6 = vec_sra(out6, pass1_bits3);
+ out7 = vec_sra(out7, pass1_bits3);
+
+ TRANSPOSE(out, col);
+
+ outb = vec_packs(col0, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[0] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[1] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col2, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[2] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[3] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col4, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[4] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[5] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col6, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[6] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[7] + output_col);
+ *outptr = outbptr[1];
+}
--- /dev/null
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* SLOW INTEGER INVERSE DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+#define CONST_BITS 13
+#define PASS1_BITS 2
+#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+
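+/*
+ * Note: DESCALE_P2 includes 3 extra bits of descaling.  The second pass not
+ * only removes the CONST_BITS and PASS1_BITS scaling but also divides the
+ * results by 8, as in the scalar jidctint.c code.
+ */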
+
+#define DO_IDCT(in, PASS) \
+{ \
+ /* Even part */ \
+ \
+ in##26l = vec_mergeh(in##2, in##6); \
+ in##26h = vec_mergel(in##2, in##6); \
+ \
+ tmp3l = vec_msums(in##26l, pw_f130_f054, zero32); \
+ tmp3h = vec_msums(in##26h, pw_f130_f054, zero32); \
+ tmp2l = vec_msums(in##26l, pw_f054_mf130, zero32); \
+ tmp2h = vec_msums(in##26h, pw_f054_mf130, zero32); \
+ \
+ tmp0 = vec_add(in##0, in##4); \
+ tmp1 = vec_sub(in##0, in##4); \
+ \
+ tmp0l = vec_unpackh(tmp0); \
+ tmp0h = vec_unpackl(tmp0); \
+ tmp0l = vec_sl(tmp0l, const_bits); \
+ tmp0h = vec_sl(tmp0h, const_bits); \
+ tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+ tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
+ \
+ tmp10l = vec_add(tmp0l, tmp3l); \
+ tmp10h = vec_add(tmp0h, tmp3h); \
+ tmp13l = vec_sub(tmp0l, tmp3l); \
+ tmp13h = vec_sub(tmp0h, tmp3h); \
+ \
+ tmp1l = vec_unpackh(tmp1); \
+ tmp1h = vec_unpackl(tmp1); \
+ tmp1l = vec_sl(tmp1l, const_bits); \
+ tmp1h = vec_sl(tmp1h, const_bits); \
+ tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+ tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
+ \
+ tmp11l = vec_add(tmp1l, tmp2l); \
+ tmp11h = vec_add(tmp1h, tmp2h); \
+ tmp12l = vec_sub(tmp1l, tmp2l); \
+ tmp12h = vec_sub(tmp1h, tmp2h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(in##3, in##7); \
+ z4 = vec_add(in##1, in##5); \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, pw_mf078_f117, zero32); \
+ z3h = vec_msums(z34h, pw_mf078_f117, zero32); \
+ z4l = vec_msums(z34l, pw_f117_f078, zero32); \
+ z4h = vec_msums(z34h, pw_f117_f078, zero32); \
+ \
+ in##71l = vec_mergeh(in##7, in##1); \
+ in##71h = vec_mergel(in##7, in##1); \
+ \
+ tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+ tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+ tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+ tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
+ \
+ in##53l = vec_mergeh(in##5, in##3); \
+ in##53h = vec_mergel(in##5, in##3); \
+ \
+ tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+ tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+ tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+ tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
+ \
+ /* Final output stage */ \
+ \
+ out0l = vec_add(tmp10l, tmp3l); \
+ out0h = vec_add(tmp10h, tmp3h); \
+ out7l = vec_sub(tmp10l, tmp3l); \
+ out7h = vec_sub(tmp10h, tmp3h); \
+ \
+ out0l = vec_sra(out0l, descale_p##PASS); \
+ out0h = vec_sra(out0h, descale_p##PASS); \
+ out7l = vec_sra(out7l, descale_p##PASS); \
+ out7h = vec_sra(out7h, descale_p##PASS); \
+ \
+ out0 = vec_pack(out0l, out0h); \
+ out7 = vec_pack(out7l, out7h); \
+ \
+ out1l = vec_add(tmp11l, tmp2l); \
+ out1h = vec_add(tmp11h, tmp2h); \
+ out6l = vec_sub(tmp11l, tmp2l); \
+ out6h = vec_sub(tmp11h, tmp2h); \
+ \
+ out1l = vec_sra(out1l, descale_p##PASS); \
+ out1h = vec_sra(out1h, descale_p##PASS); \
+ out6l = vec_sra(out6l, descale_p##PASS); \
+ out6h = vec_sra(out6h, descale_p##PASS); \
+ \
+ out1 = vec_pack(out1l, out1h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ out2l = vec_add(tmp12l, tmp1l); \
+ out2h = vec_add(tmp12h, tmp1h); \
+ out5l = vec_sub(tmp12l, tmp1l); \
+ out5h = vec_sub(tmp12h, tmp1h); \
+ \
+ out2l = vec_sra(out2l, descale_p##PASS); \
+ out2h = vec_sra(out2h, descale_p##PASS); \
+ out5l = vec_sra(out5l, descale_p##PASS); \
+ out5h = vec_sra(out5h, descale_p##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out5 = vec_pack(out5l, out5h); \
+ \
+ out3l = vec_add(tmp13l, tmp0l); \
+ out3h = vec_add(tmp13h, tmp0h); \
+ out4l = vec_sub(tmp13l, tmp0l); \
+ out4h = vec_sub(tmp13h, tmp0h); \
+ \
+ out3l = vec_sra(out3l, descale_p##PASS); \
+ out3h = vec_sra(out3h, descale_p##PASS); \
+ out4l = vec_sra(out4l, descale_p##PASS); \
+ out4h = vec_sra(out4h, descale_p##PASS); \
+ \
+ out3 = vec_pack(out3l, out3h); \
+ out4 = vec_pack(out4l, out4h); \
+}
+
+
+void
+jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ short *dct_table = (short *)dct_table_;
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
+ tmp0, tmp1, tmp2, tmp3, z3, z4,
+ z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
+ row71l, row71h, row26l, row26h, row53l, row53h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
+ tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
+ z3l, z3h, z4l, z4h,
+ out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
+ out5l, out5h, out6l, out6h, out7l, out7h;
+ __vector signed char outb;
+ long long *outptr, *outbptr = (long long *)(&outb);
+
+ /* Constants */
+ __vector short zero16 = { __8X(0) },
+ pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+ pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+ pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+ pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+ pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+ pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+ pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+ pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
+ __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+ __vector int zero32 = { __4X(0) },
+ pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+ pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+ __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+ descale_p2 = { __4X(DESCALE_P2) },
+ const_bits = { __4X(CONST_BITS) };
+ __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
+
+ /* Pass 1: process columns. */
+
+ col0 = *(__vector short *)&coef_block[0];
+ col1 = *(__vector short *)&coef_block[8];
+ col2 = *(__vector short *)&coef_block[16];
+ col3 = *(__vector short *)&coef_block[24];
+ col4 = *(__vector short *)&coef_block[32];
+ col5 = *(__vector short *)&coef_block[40];
+ col6 = *(__vector short *)&coef_block[48];
+ col7 = *(__vector short *)&coef_block[56];
+
+ tmp1 = vec_or(col1, col2);
+ tmp2 = vec_or(col3, col4);
+ tmp1 = vec_or(tmp1, tmp2);
+ tmp3 = vec_or(col5, col6);
+ tmp3 = vec_or(tmp3, col7);
+ tmp1 = vec_or(tmp1, tmp3);
+
+ quant0 = *(__vector short *)&dct_table[0];
+ col0 = vec_mladd(col0, quant0, zero16);
+
+ if (vec_all_eq(tmp1, zero16)) {
+ /* AC terms all zero */
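+    /* (When all of the AC terms are zero, the 1-D IDCT of each column is
+     * constant, so the column pass reduces to scaling the dequantized DC
+     * terms and replicating them.) */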
+
+ col0 = vec_sl(col0, pass1_bits);
+
+ row0 = vec_splat(col0, 0);
+ row1 = vec_splat(col0, 1);
+ row2 = vec_splat(col0, 2);
+ row3 = vec_splat(col0, 3);
+ row4 = vec_splat(col0, 4);
+ row5 = vec_splat(col0, 5);
+ row6 = vec_splat(col0, 6);
+ row7 = vec_splat(col0, 7);
+
+ } else {
+
+ quant1 = *(__vector short *)&dct_table[8];
+ quant2 = *(__vector short *)&dct_table[16];
+ quant3 = *(__vector short *)&dct_table[24];
+ quant4 = *(__vector short *)&dct_table[32];
+ quant5 = *(__vector short *)&dct_table[40];
+ quant6 = *(__vector short *)&dct_table[48];
+ quant7 = *(__vector short *)&dct_table[56];
+
+ col1 = vec_mladd(col1, quant1, zero16);
+ col2 = vec_mladd(col2, quant2, zero16);
+ col3 = vec_mladd(col3, quant3, zero16);
+ col4 = vec_mladd(col4, quant4, zero16);
+ col5 = vec_mladd(col5, quant5, zero16);
+ col6 = vec_mladd(col6, quant6, zero16);
+ col7 = vec_mladd(col7, quant7, zero16);
+
+ DO_IDCT(col, 1);
+
+ TRANSPOSE(out, row);
+ }
+
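+  /* Pass 2: process rows. */
+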
+ DO_IDCT(row, 2);
+
+ TRANSPOSE(out, col);
+
+ outb = vec_packs(col0, col1);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[0] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[1] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col2, col3);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[2] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[3] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col4, col5);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[4] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[5] + output_col);
+ *outptr = outbptr[1];
+
+ outb = vec_packs(col6, col7);
+ outb = vec_add(outb, pb_centerjsamp);
+ outptr = (long long *)(output_buf[6] + output_col);
+ *outptr = outbptr[0];
+ outptr = (long long *)(output_buf[7] + output_col);
+ *outptr = outbptr[1];
+}
--- /dev/null
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.
+ * All rights reserved.
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../jinclude.h"
+#include "../jpeglib.h"
+#include "../jsimd.h"
+#include "../jdct.h"
+#include "../jsimddct.h"
+#include "jsimd.h"
+#include <altivec.h>
+
+
+/* Common code */
+
+#define __4X(a) a, a, a, a
+#define __4X2(a, b) a, b, a, b, a, b, a, b
+#define __8X(a) __4X(a), __4X(a)
+#define __16X(a) __8X(a), __8X(a)
+
+#define TRANSPOSE(row, col) \
+{ \
+ __vector short row04l, row04h, row15l, row15h, \
+ row26l, row26h, row37l, row37h; \
+ __vector short col01e, col01o, col23e, col23o, \
+ col45e, col45o, col67e, col67o; \
+ \
+ /* transpose coefficients (phase 1) */ \
+ row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
+ row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
+ row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
+ row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
+ row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
+ row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
+ row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
+ row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
+ \
+ /* transpose coefficients (phase 2) */ \
+ col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
+ col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
+ col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
+ col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
+ col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
+ col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
+ col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
+ col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
+ \
+ /* transpose coefficients (phase 3) */ \
+ col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+ col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+ col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+ col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+ col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+ col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+ col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+ col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
+}
+++ /dev/null
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.
- * All rights reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty. In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- * claim that you wrote the original software. If you use this software
- * in a product, an acknowledgment in the product documentation would be
- * appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- * misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-#include <altivec.h>
-
-
-/* Common code */
-
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
-
-#define TRANSPOSE(row, col) \
-{ \
- __vector short row04l, row04h, row15l, row15h, \
- row26l, row26h, row37l, row37h; \
- __vector short col01e, col01o, col23e, col23o, \
- col45e, col45o, col67e, col67o; \
- \
- /* transpose coefficients (phase 1) */ \
- row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
- row04h = vec_mergel(row##0, row##4); /* row04h=(04 44 05 45 06 46 07 47) */ \
- row15l = vec_mergeh(row##1, row##5); /* row15l=(10 50 11 51 12 52 13 53) */ \
- row15h = vec_mergel(row##1, row##5); /* row15h=(14 54 15 55 16 56 17 57) */ \
- row26l = vec_mergeh(row##2, row##6); /* row26l=(20 60 21 61 22 62 23 63) */ \
- row26h = vec_mergel(row##2, row##6); /* row26h=(24 64 25 65 26 66 27 67) */ \
- row37l = vec_mergeh(row##3, row##7); /* row37l=(30 70 31 71 32 72 33 73) */ \
- row37h = vec_mergel(row##3, row##7); /* row37h=(34 74 35 75 36 76 37 77) */ \
- \
- /* transpose coefficients (phase 2) */ \
- col01e = vec_mergeh(row04l, row26l); /* col01e=(00 20 40 60 01 21 41 61) */ \
- col23e = vec_mergel(row04l, row26l); /* col23e=(02 22 42 62 03 23 43 63) */ \
- col45e = vec_mergeh(row04h, row26h); /* col45e=(04 24 44 64 05 25 45 65) */ \
- col67e = vec_mergel(row04h, row26h); /* col67e=(06 26 46 66 07 27 47 67) */ \
- col01o = vec_mergeh(row15l, row37l); /* col01o=(10 30 50 70 11 31 51 71) */ \
- col23o = vec_mergel(row15l, row37l); /* col23o=(12 32 52 72 13 33 53 73) */ \
- col45o = vec_mergeh(row15h, row37h); /* col45o=(14 34 54 74 15 35 55 75) */ \
- col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
- \
- /* transpose coefficients (phase 3) */ \
- col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
- col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
- col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
- col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
- col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
- col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
- col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
- col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
-}
-
-
-/* SLOW INTEGER FORWARD DCT */
-
-#define ISLOW_F_0_298 2446 /* FIX(0.298631336) */
-#define ISLOW_F_0_390 3196 /* FIX(0.390180644) */
-#define ISLOW_F_0_541 4433 /* FIX(0.541196100) */
-#define ISLOW_F_0_765 6270 /* FIX(0.765366865) */
-#define ISLOW_F_0_899 7373 /* FIX(0.899976223) */
-#define ISLOW_F_1_175 9633 /* FIX(1.175875602) */
-#define ISLOW_F_1_501 12299 /* FIX(1.501321110) */
-#define ISLOW_F_1_847 15137 /* FIX(1.847759065) */
-#define ISLOW_F_1_961 16069 /* FIX(1.961570560) */
-#define ISLOW_F_2_053 16819 /* FIX(2.053119869) */
-#define ISLOW_F_2_562 20995 /* FIX(2.562915447) */
-#define ISLOW_F_3_072 25172 /* FIX(3.072711026) */
-
-#define ISLOW_CONST_BITS 13
-#define ISLOW_PASS1_BITS 2
-#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
-#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
-
-#define DO_FDCT_ISLOW_COMMON(PASS) \
-{ \
- tmp1312l = vec_mergeh(tmp13, tmp12); \
- tmp1312h = vec_mergel(tmp13, tmp12); \
- \
- out2l = vec_msums(tmp1312l, PW_F130_F054, PD_DESCALE_P##PASS); \
- out2h = vec_msums(tmp1312h, PW_F130_F054, PD_DESCALE_P##PASS); \
- out6l = vec_msums(tmp1312l, PW_F054_MF130, PD_DESCALE_P##PASS); \
- out6h = vec_msums(tmp1312h, PW_F054_MF130, PD_DESCALE_P##PASS); \
- \
- out2l = vec_sra(out2l, DESCALE_P##PASS); \
- out2h = vec_sra(out2h, DESCALE_P##PASS); \
- out6l = vec_sra(out6l, DESCALE_P##PASS); \
- out6h = vec_sra(out6h, DESCALE_P##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out6 = vec_pack(out6l, out6h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(tmp4, tmp6); \
- z4 = vec_add(tmp5, tmp7); \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, PW_MF078_F117, PD_DESCALE_P##PASS); \
- z3h = vec_msums(z34h, PW_MF078_F117, PD_DESCALE_P##PASS); \
- z4l = vec_msums(z34l, PW_F117_F078, PD_DESCALE_P##PASS); \
- z4h = vec_msums(z34h, PW_F117_F078, PD_DESCALE_P##PASS); \
- \
- tmp47l = vec_mergeh(tmp4, tmp7); \
- tmp47h = vec_mergel(tmp4, tmp7); \
- \
- out7l = vec_msums(tmp47l, PW_MF060_MF089, z3l); \
- out7h = vec_msums(tmp47h, PW_MF060_MF089, z3h); \
- out1l = vec_msums(tmp47l, PW_MF089_F060, z4l); \
- out1h = vec_msums(tmp47h, PW_MF089_F060, z4h); \
- \
- out7l = vec_sra(out7l, DESCALE_P##PASS); \
- out7h = vec_sra(out7h, DESCALE_P##PASS); \
- out1l = vec_sra(out1l, DESCALE_P##PASS); \
- out1h = vec_sra(out1h, DESCALE_P##PASS); \
- \
- out7 = vec_pack(out7l, out7h); \
- out1 = vec_pack(out1l, out1h); \
- \
- tmp56l = vec_mergeh(tmp5, tmp6); \
- tmp56h = vec_mergel(tmp5, tmp6); \
- \
- out5l = vec_msums(tmp56l, PW_MF050_MF256, z4l); \
- out5h = vec_msums(tmp56h, PW_MF050_MF256, z4h); \
- out3l = vec_msums(tmp56l, PW_MF256_F050, z3l); \
- out3h = vec_msums(tmp56h, PW_MF256_F050, z3h); \
- \
- out5l = vec_sra(out5l, DESCALE_P##PASS); \
- out5h = vec_sra(out5h, DESCALE_P##PASS); \
- out3l = vec_sra(out3l, DESCALE_P##PASS); \
- out3h = vec_sra(out3h, DESCALE_P##PASS); \
- \
- out5 = vec_pack(out5l, out5h); \
- out3 = vec_pack(out3l, out3h); \
-}
-
-#define DO_FDCT_ISLOW_ROWS() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_sl(out0, PASS1_BITS); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_sl(out4, PASS1_BITS); \
- \
- DO_FDCT_ISLOW_COMMON(1); \
-}
-
-#define DO_FDCT_ISLOW_COLS() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out0 = vec_add(out0, PW_DESCALE_P2X); \
- out0 = vec_sra(out0, PASS1_BITS); \
- out4 = vec_sub(tmp10, tmp11); \
- out4 = vec_add(out4, PW_DESCALE_P2X); \
- out4 = vec_sra(out4, PASS1_BITS); \
- \
- DO_FDCT_ISLOW_COMMON(2); \
-}
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
- z3, z4, z34l, z34h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int z3l, z3h, z4l, z4h,
- out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
- out7l, out7h;
-
- /* Constants */
- __vector short
- PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
- PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
- PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
- PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
- PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
- PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
- PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
- PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) },
- PW_DESCALE_P2X = { __8X(1 << (ISLOW_PASS1_BITS - 1)) };
- __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
- __vector int PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
- PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 - 1)) };
- __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
- DESCALE_P2 = { __4X(ISLOW_DESCALE_P2) };
-
- /* Pass 1: process rows. */
-
- row0 = *(__vector short *)&data[0];
- row1 = *(__vector short *)&data[8];
- row2 = *(__vector short *)&data[16];
- row3 = *(__vector short *)&data[24];
- row4 = *(__vector short *)&data[32];
- row5 = *(__vector short *)&data[40];
- row6 = *(__vector short *)&data[48];
- row7 = *(__vector short *)&data[56];
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT_ISLOW_ROWS();
-
- /* Pass 2: process columns. */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT_ISLOW_COLS();
-
- *(__vector short *)&data[0] = out0;
- *(__vector short *)&data[8] = out1;
- *(__vector short *)&data[16] = out2;
- *(__vector short *)&data[24] = out3;
- *(__vector short *)&data[32] = out4;
- *(__vector short *)&data[40] = out5;
- *(__vector short *)&data[48] = out6;
- *(__vector short *)&data[56] = out7;
-}
-
-
-/* FAST INTEGER FORWARD DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.) This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- * the elements in arg3 + the most significant 17 bits of
- * (the elements in arg1 * the elements in arg2).
- */
-
-#define IFAST_F_0_382 98 /* FIX(0.382683433) */
-#define IFAST_F_0_541 139 /* FIX(0.541196100) */
-#define IFAST_F_0_707 181 /* FIX(0.707106781) */
-#define IFAST_F_1_306 334 /* FIX(1.306562965) */
-
-#define IFAST_CONST_BITS 8
-#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
-#define IFAST_CONST_SHIFT \
- (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
-
-#define DO_FDCT_IFAST() \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(tmp0, tmp3); \
- tmp13 = vec_sub(tmp0, tmp3); \
- tmp11 = vec_add(tmp1, tmp2); \
- tmp12 = vec_sub(tmp1, tmp2); \
- \
- out0 = vec_add(tmp10, tmp11); \
- out4 = vec_sub(tmp10, tmp11); \
- \
- z1 = vec_add(tmp12, tmp13); \
- z1 = vec_sl(z1, PRE_MULTIPLY_SCALE_BITS); \
- z1 = vec_madds(z1, PW_0707, zero); \
- \
- out2 = vec_add(tmp13, z1); \
- out6 = vec_sub(tmp13, z1); \
- \
- /* Odd part */ \
- \
- tmp10 = vec_add(tmp4, tmp5); \
- tmp11 = vec_add(tmp5, tmp6); \
- tmp12 = vec_add(tmp6, tmp7); \
- \
- tmp10 = vec_sl(tmp10, PRE_MULTIPLY_SCALE_BITS); \
- tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS); \
- z5 = vec_sub(tmp10, tmp12); \
- z5 = vec_madds(z5, PW_0382, zero); \
- \
- z2 = vec_madds(tmp10, PW_0541, z5); \
- z4 = vec_madds(tmp12, PW_1306, z5); \
- \
- tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS); \
- z3 = vec_madds(tmp11, PW_0707, zero); \
- \
- z11 = vec_add(tmp7, z3); \
- z13 = vec_sub(tmp7, z3); \
- \
- out5 = vec_add(z13, z2); \
- out3 = vec_sub(z13, z2); \
- out1 = vec_add(z11, z4); \
- out7 = vec_sub(z11, z4); \
-}
-
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
-{
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- z1, z2, z3, z4, z5, z11, z13,
- out0, out1, out2, out3, out4, out5, out6, out7;
-
- /* Constants */
- __vector short zero = vec_splat_s16(0),
- PW_0382 = { __8X(IFAST_F_0_382 << IFAST_CONST_SHIFT) },
- PW_0541 = { __8X(IFAST_F_0_541 << IFAST_CONST_SHIFT) },
- PW_0707 = { __8X(IFAST_F_0_707 << IFAST_CONST_SHIFT) },
- PW_1306 = { __8X(IFAST_F_1_306 << IFAST_CONST_SHIFT) };
- __vector unsigned short
- PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) };
-
- /* Pass 1: process rows. */
-
- row0 = *(__vector short *)&data[0];
- row1 = *(__vector short *)&data[8];
- row2 = *(__vector short *)&data[16];
- row3 = *(__vector short *)&data[24];
- row4 = *(__vector short *)&data[32];
- row5 = *(__vector short *)&data[40];
- row6 = *(__vector short *)&data[48];
- row7 = *(__vector short *)&data[56];
-
- TRANSPOSE(row, col);
-
- tmp0 = vec_add(col0, col7);
- tmp7 = vec_sub(col0, col7);
- tmp1 = vec_add(col1, col6);
- tmp6 = vec_sub(col1, col6);
- tmp2 = vec_add(col2, col5);
- tmp5 = vec_sub(col2, col5);
- tmp3 = vec_add(col3, col4);
- tmp4 = vec_sub(col3, col4);
-
- DO_FDCT_IFAST();
-
- /* Pass 2: process columns. */
-
- TRANSPOSE(out, row);
-
- tmp0 = vec_add(row0, row7);
- tmp7 = vec_sub(row0, row7);
- tmp1 = vec_add(row1, row6);
- tmp6 = vec_sub(row1, row6);
- tmp2 = vec_add(row2, row5);
- tmp5 = vec_sub(row2, row5);
- tmp3 = vec_add(row3, row4);
- tmp4 = vec_sub(row3, row4);
-
- DO_FDCT_IFAST();
-
- *(__vector short *)&data[0] = out0;
- *(__vector short *)&data[8] = out1;
- *(__vector short *)&data[16] = out2;
- *(__vector short *)&data[24] = out3;
- *(__vector short *)&data[32] = out4;
- *(__vector short *)&data[40] = out5;
- *(__vector short *)&data[48] = out6;
- *(__vector short *)&data[56] = out7;
-}
-
-
-/* SLOW INTEGER INVERSE DCT */
-
-#define DO_IDCT_ISLOW(in, PASS) \
-{ \
- /* Even part */ \
- \
- in##26l = vec_mergeh(in##2, in##6); \
- in##26h = vec_mergel(in##2, in##6); \
- \
- tmp3l = vec_msums(in##26l, PW_F130_F054, zero32); \
- tmp3h = vec_msums(in##26h, PW_F130_F054, zero32); \
- tmp2l = vec_msums(in##26l, PW_F054_MF130, zero32); \
- tmp2h = vec_msums(in##26h, PW_F054_MF130, zero32); \
- \
- tmp0 = vec_add(in##0, in##4); \
- tmp1 = vec_sub(in##0, in##4); \
- \
- tmp0l = vec_unpackh(tmp0); \
- tmp0h = vec_unpackl(tmp0); \
- tmp0l = vec_sl(tmp0l, CONST_BITS); \
- tmp0h = vec_sl(tmp0h, CONST_BITS); \
- tmp0l = vec_add(tmp0l, PD_DESCALE_P##PASS); \
- tmp0h = vec_add(tmp0h, PD_DESCALE_P##PASS); \
- \
- tmp10l = vec_add(tmp0l, tmp3l); \
- tmp10h = vec_add(tmp0h, tmp3h); \
- tmp13l = vec_sub(tmp0l, tmp3l); \
- tmp13h = vec_sub(tmp0h, tmp3h); \
- \
- tmp1l = vec_unpackh(tmp1); \
- tmp1h = vec_unpackl(tmp1); \
- tmp1l = vec_sl(tmp1l, CONST_BITS); \
- tmp1h = vec_sl(tmp1h, CONST_BITS); \
- tmp1l = vec_add(tmp1l, PD_DESCALE_P##PASS); \
- tmp1h = vec_add(tmp1h, PD_DESCALE_P##PASS); \
- \
- tmp11l = vec_add(tmp1l, tmp2l); \
- tmp11h = vec_add(tmp1h, tmp2h); \
- tmp12l = vec_sub(tmp1l, tmp2l); \
- tmp12h = vec_sub(tmp1h, tmp2h); \
- \
- /* Odd part */ \
- \
- z3 = vec_add(in##3, in##7); \
- z4 = vec_add(in##1, in##5); \
- \
- z34l = vec_mergeh(z3, z4); \
- z34h = vec_mergel(z3, z4); \
- \
- z3l = vec_msums(z34l, PW_MF078_F117, zero32); \
- z3h = vec_msums(z34h, PW_MF078_F117, zero32); \
- z4l = vec_msums(z34l, PW_F117_F078, zero32); \
- z4h = vec_msums(z34h, PW_F117_F078, zero32); \
- \
- in##71l = vec_mergeh(in##7, in##1); \
- in##71h = vec_mergel(in##7, in##1); \
- \
- tmp0l = vec_msums(in##71l, PW_MF060_MF089, z3l); \
- tmp0h = vec_msums(in##71h, PW_MF060_MF089, z3h); \
- tmp3l = vec_msums(in##71l, PW_MF089_F060, z4l); \
- tmp3h = vec_msums(in##71h, PW_MF089_F060, z4h); \
- \
- in##53l = vec_mergeh(in##5, in##3); \
- in##53h = vec_mergel(in##5, in##3); \
- \
- tmp1l = vec_msums(in##53l, PW_MF050_MF256, z4l); \
- tmp1h = vec_msums(in##53h, PW_MF050_MF256, z4h); \
- tmp2l = vec_msums(in##53l, PW_MF256_F050, z3l); \
- tmp2h = vec_msums(in##53h, PW_MF256_F050, z3h); \
- \
- /* Final output stage */ \
- \
- out0l = vec_add(tmp10l, tmp3l); \
- out0h = vec_add(tmp10h, tmp3h); \
- out7l = vec_sub(tmp10l, tmp3l); \
- out7h = vec_sub(tmp10h, tmp3h); \
- \
- out0l = vec_sra(out0l, DESCALE_P##PASS); \
- out0h = vec_sra(out0h, DESCALE_P##PASS); \
- out7l = vec_sra(out7l, DESCALE_P##PASS); \
- out7h = vec_sra(out7h, DESCALE_P##PASS); \
- \
- out0 = vec_pack(out0l, out0h); \
- out7 = vec_pack(out7l, out7h); \
- \
- out1l = vec_add(tmp11l, tmp2l); \
- out1h = vec_add(tmp11h, tmp2h); \
- out6l = vec_sub(tmp11l, tmp2l); \
- out6h = vec_sub(tmp11h, tmp2h); \
- \
- out1l = vec_sra(out1l, DESCALE_P##PASS); \
- out1h = vec_sra(out1h, DESCALE_P##PASS); \
- out6l = vec_sra(out6l, DESCALE_P##PASS); \
- out6h = vec_sra(out6h, DESCALE_P##PASS); \
- \
- out1 = vec_pack(out1l, out1h); \
- out6 = vec_pack(out6l, out6h); \
- \
- out2l = vec_add(tmp12l, tmp1l); \
- out2h = vec_add(tmp12h, tmp1h); \
- out5l = vec_sub(tmp12l, tmp1l); \
- out5h = vec_sub(tmp12h, tmp1h); \
- \
- out2l = vec_sra(out2l, DESCALE_P##PASS); \
- out2h = vec_sra(out2h, DESCALE_P##PASS); \
- out5l = vec_sra(out5l, DESCALE_P##PASS); \
- out5h = vec_sra(out5h, DESCALE_P##PASS); \
- \
- out2 = vec_pack(out2l, out2h); \
- out5 = vec_pack(out5l, out5h); \
- \
- out3l = vec_add(tmp13l, tmp0l); \
- out3h = vec_add(tmp13h, tmp0h); \
- out4l = vec_sub(tmp13l, tmp0l); \
- out4h = vec_sub(tmp13h, tmp0h); \
- \
- out3l = vec_sra(out3l, DESCALE_P##PASS); \
- out3h = vec_sra(out3h, DESCALE_P##PASS); \
- out4l = vec_sra(out4l, DESCALE_P##PASS); \
- out4h = vec_sra(out4h, DESCALE_P##PASS); \
- \
- out3 = vec_pack(out3l, out3h); \
- out4 = vec_pack(out4l, out4h); \
-}
-
-void
-jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- short *dct_table = (short *)dct_table_;
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
- tmp0, tmp1, tmp2, tmp3, z3, z4,
- z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
- row71l, row71h, row26l, row26h, row53l, row53h,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
- tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
- z3l, z3h, z4l, z4h,
- out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
- out5l, out5h, out6l, out6h, out7l, out7h;
- __vector signed char outb;
- long long *outptr, *outbptr = (long long *)(&outb);
-
- /* Constants */
- __vector short zero16 = { __8X(0) },
- PW_F130_F054 = { __4X2(ISLOW_F_0_541 + ISLOW_F_0_765, ISLOW_F_0_541) },
- PW_F054_MF130 = { __4X2(ISLOW_F_0_541, ISLOW_F_0_541 - ISLOW_F_1_847) },
- PW_MF078_F117 = { __4X2(ISLOW_F_1_175 - ISLOW_F_1_961, ISLOW_F_1_175) },
- PW_F117_F078 = { __4X2(ISLOW_F_1_175, ISLOW_F_1_175 - ISLOW_F_0_390) },
- PW_MF060_MF089 = { __4X2(ISLOW_F_0_298 - ISLOW_F_0_899, -ISLOW_F_0_899) },
- PW_MF089_F060 = { __4X2(-ISLOW_F_0_899, ISLOW_F_1_501 - ISLOW_F_0_899) },
- PW_MF050_MF256 = { __4X2(ISLOW_F_2_053 - ISLOW_F_2_562, -ISLOW_F_2_562) },
- PW_MF256_F050 = { __4X2(-ISLOW_F_2_562, ISLOW_F_3_072 - ISLOW_F_2_562) };
- __vector unsigned short PASS1_BITS = { __8X(ISLOW_PASS1_BITS) };
- __vector int zero32 = { __4X(0) },
- PD_DESCALE_P1 = { __4X(1 << (ISLOW_DESCALE_P1 - 1)) },
- PD_DESCALE_P2 = { __4X(1 << (ISLOW_DESCALE_P2 + 3 - 1)) };
- __vector unsigned int DESCALE_P1 = { __4X(ISLOW_DESCALE_P1) },
- DESCALE_P2 = { __4X(ISLOW_DESCALE_P2 + 3) },
- CONST_BITS = { __4X(ISLOW_CONST_BITS) };
- __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
-
- /* Pass 1: process columns. */
-
- col0 = *(__vector short *)&coef_block[0];
- col1 = *(__vector short *)&coef_block[8];
- col2 = *(__vector short *)&coef_block[16];
- col3 = *(__vector short *)&coef_block[24];
- col4 = *(__vector short *)&coef_block[32];
- col5 = *(__vector short *)&coef_block[40];
- col6 = *(__vector short *)&coef_block[48];
- col7 = *(__vector short *)&coef_block[56];
-
- tmp1 = vec_or(col1, col2);
- tmp2 = vec_or(col3, col4);
- tmp1 = vec_or(tmp1, tmp2);
- tmp3 = vec_or(col5, col6);
- tmp3 = vec_or(tmp3, col7);
- tmp1 = vec_or(tmp1, tmp3);
-
- quant0 = *(__vector short *)&dct_table[0];
- col0 = vec_mladd(col0, quant0, zero16);
-
- if (vec_all_eq(tmp1, zero16)) {
- /* AC terms all zero */
-
- col0 = vec_sl(col0, PASS1_BITS);
-
- row0 = vec_splat(col0, 0);
- row1 = vec_splat(col0, 1);
- row2 = vec_splat(col0, 2);
- row3 = vec_splat(col0, 3);
- row4 = vec_splat(col0, 4);
- row5 = vec_splat(col0, 5);
- row6 = vec_splat(col0, 6);
- row7 = vec_splat(col0, 7);
-
- } else {
-
- quant1 = *(__vector short *)&dct_table[8];
- quant2 = *(__vector short *)&dct_table[16];
- quant3 = *(__vector short *)&dct_table[24];
- quant4 = *(__vector short *)&dct_table[32];
- quant5 = *(__vector short *)&dct_table[40];
- quant6 = *(__vector short *)&dct_table[48];
- quant7 = *(__vector short *)&dct_table[56];
-
- col1 = vec_mladd(col1, quant1, zero16);
- col2 = vec_mladd(col2, quant2, zero16);
- col3 = vec_mladd(col3, quant3, zero16);
- col4 = vec_mladd(col4, quant4, zero16);
- col5 = vec_mladd(col5, quant5, zero16);
- col6 = vec_mladd(col6, quant6, zero16);
- col7 = vec_mladd(col7, quant7, zero16);
-
- DO_IDCT_ISLOW(col, 1);
-
- TRANSPOSE(out, row);
- }
-
- DO_IDCT_ISLOW(row, 2);
-
- TRANSPOSE(out, col);
-
- outb = vec_packs(col0, col1);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[0] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[1] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col2, col3);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[2] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[3] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col4, col5);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[4] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[5] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col6, col7);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[6] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[7] + output_col);
- *outptr = outbptr[1];
-}
-
-
-/* FAST INTEGER INVERSE DCT
- *
- * This is similar to the SSE2 implementation, except that we left-shift the
- * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.) This is because
- * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
- * the elements in arg3 + the most significant 17 bits of
- * (the elements in arg1 * the elements in arg2).
- */
-
-#define IFAST_F_1_082 277 /* FIX(1.082392200) */
-#define IFAST_F_1_414 362 /* FIX(1.414213562) */
-#define IFAST_F_1_847 473 /* FIX(1.847759065) */
-#define IFAST_F_2_613 669 /* FIX(2.613125930) */
-#define IFAST_F_1_613 (IFAST_F_2_613 - 256) /* FIX(2.613125930) - FIX(1) */
-
-#define IFAST_PASS1_BITS 2
-
-#define DO_IDCT_IFAST(in) \
-{ \
- /* Even part */ \
- \
- tmp10 = vec_add(in##0, in##4); \
- tmp11 = vec_sub(in##0, in##4); \
- tmp13 = vec_add(in##2, in##6); \
- \
- tmp12 = vec_sub(in##2, in##6); \
- tmp12 = vec_sl(tmp12, PRE_MULTIPLY_SCALE_BITS); \
- tmp12 = vec_madds(tmp12, PW_F1414, zero); \
- tmp12 = vec_sub(tmp12, tmp13); \
- \
- tmp0 = vec_add(tmp10, tmp13); \
- tmp3 = vec_sub(tmp10, tmp13); \
- tmp1 = vec_add(tmp11, tmp12); \
- tmp2 = vec_sub(tmp11, tmp12); \
- \
- /* Odd part */ \
- \
- z13 = vec_add(in##5, in##3); \
- z10 = vec_sub(in##5, in##3); \
- z10s = vec_sl(z10, PRE_MULTIPLY_SCALE_BITS); \
- z11 = vec_add(in##1, in##7); \
- z12s = vec_sub(in##1, in##7); \
- z12s = vec_sl(z12s, PRE_MULTIPLY_SCALE_BITS); \
- \
- tmp11 = vec_sub(z11, z13); \
- tmp11 = vec_sl(tmp11, PRE_MULTIPLY_SCALE_BITS); \
- tmp11 = vec_madds(tmp11, PW_F1414, zero); \
- \
- tmp7 = vec_add(z11, z13); \
- \
- z5 = vec_add(z10s, z12s); \
- z5 = vec_madds(z5, PW_F1847, zero); \
- \
- tmp10 = vec_madds(z12s, PW_F1082, zero); \
- tmp10 = vec_sub(tmp10, z5); \
- tmp12 = vec_madds(z10s, PW_MF1613, z5); \
- tmp12 = vec_sub(tmp12, z10); \
- \
- tmp6 = vec_sub(tmp12, tmp7); \
- tmp5 = vec_sub(tmp11, tmp6); \
- tmp4 = vec_add(tmp10, tmp5); \
- \
- out0 = vec_add(tmp0, tmp7); \
- out1 = vec_add(tmp1, tmp6); \
- out2 = vec_add(tmp2, tmp5); \
- out3 = vec_sub(tmp3, tmp4); \
- out4 = vec_add(tmp3, tmp4); \
- out5 = vec_sub(tmp2, tmp5); \
- out6 = vec_sub(tmp1, tmp6); \
- out7 = vec_sub(tmp0, tmp7); \
-}
-
-void
-jsimd_idct_ifast_altivec (void * dct_table_, JCOEFPTR coef_block,
- JSAMPARRAY output_buf, JDIMENSION output_col)
-{
- short *dct_table = (short *)dct_table_;
- __vector short row0, row1, row2, row3, row4, row5, row6, row7,
- col0, col1, col2, col3, col4, col5, col6, col7,
- quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
- z5, z10, z10s, z11, z12s, z13,
- out0, out1, out2, out3, out4, out5, out6, out7;
- __vector signed char outb;
- long long *outptr, *outbptr = (long long *)(&outb);
-
- /* Constants */
- __vector short zero = { __8X(0) },
- PW_F1414 = { __8X(IFAST_F_1_414 << IFAST_CONST_SHIFT) },
- PW_F1847 = { __8X(IFAST_F_1_847 << IFAST_CONST_SHIFT) },
- PW_MF1613 = { __8X(-IFAST_F_1_613 << IFAST_CONST_SHIFT) },
- PW_F1082 = { __8X(IFAST_F_1_082 << IFAST_CONST_SHIFT) };
- __vector unsigned short
- PRE_MULTIPLY_SCALE_BITS = { __8X(IFAST_PRE_MULTIPLY_SCALE_BITS) },
- PASS1_BITS3 = { __8X(IFAST_PASS1_BITS + 3) };
- __vector signed char PB_CENTERJSAMP = { __16X(CENTERJSAMPLE) };
-
- /* Pass 1: process columns. */
-
- col0 = *(__vector short *)&coef_block[0];
- col1 = *(__vector short *)&coef_block[8];
- col2 = *(__vector short *)&coef_block[16];
- col3 = *(__vector short *)&coef_block[24];
- col4 = *(__vector short *)&coef_block[32];
- col5 = *(__vector short *)&coef_block[40];
- col6 = *(__vector short *)&coef_block[48];
- col7 = *(__vector short *)&coef_block[56];
-
- tmp1 = vec_or(col1, col2);
- tmp2 = vec_or(col3, col4);
- tmp1 = vec_or(tmp1, tmp2);
- tmp3 = vec_or(col5, col6);
- tmp3 = vec_or(tmp3, col7);
- tmp1 = vec_or(tmp1, tmp3);
-
- quant0 = *(__vector short *)&dct_table[0];
- col0 = vec_mladd(col0, quant0, zero);
-
- if (vec_all_eq(tmp1, zero)) {
- /* AC terms all zero */
-
- row0 = vec_splat(col0, 0);
- row1 = vec_splat(col0, 1);
- row2 = vec_splat(col0, 2);
- row3 = vec_splat(col0, 3);
- row4 = vec_splat(col0, 4);
- row5 = vec_splat(col0, 5);
- row6 = vec_splat(col0, 6);
- row7 = vec_splat(col0, 7);
-
- } else {
-
- quant1 = *(__vector short *)&dct_table[8];
- quant2 = *(__vector short *)&dct_table[16];
- quant3 = *(__vector short *)&dct_table[24];
- quant4 = *(__vector short *)&dct_table[32];
- quant5 = *(__vector short *)&dct_table[40];
- quant6 = *(__vector short *)&dct_table[48];
- quant7 = *(__vector short *)&dct_table[56];
-
- col1 = vec_mladd(col1, quant1, zero);
- col2 = vec_mladd(col2, quant2, zero);
- col3 = vec_mladd(col3, quant3, zero);
- col4 = vec_mladd(col4, quant4, zero);
- col5 = vec_mladd(col5, quant5, zero);
- col6 = vec_mladd(col6, quant6, zero);
- col7 = vec_mladd(col7, quant7, zero);
-
- DO_IDCT_IFAST(col);
-
- TRANSPOSE(out, row);
- }
-
- /* Pass 2: process rows. */
-
- DO_IDCT_IFAST(row);
-
- out0 = vec_sra(out0, PASS1_BITS3);
- out1 = vec_sra(out1, PASS1_BITS3);
- out2 = vec_sra(out2, PASS1_BITS3);
- out3 = vec_sra(out3, PASS1_BITS3);
- out4 = vec_sra(out4, PASS1_BITS3);
- out5 = vec_sra(out5, PASS1_BITS3);
- out6 = vec_sra(out6, PASS1_BITS3);
- out7 = vec_sra(out7, PASS1_BITS3);
-
- TRANSPOSE(out, col);
-
- outb = vec_packs(col0, col1);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[0] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[1] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col2, col3);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[2] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[3] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col4, col5);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[4] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[5] + output_col);
- *outptr = outbptr[1];
-
- outb = vec_packs(col6, col7);
- outb = vec_add(outb, PB_CENTERJSAMP);
- outptr = (long long *)(output_buf[6] + output_col);
- *outptr = outbptr[0];
- outptr = (long long *)(output_buf[7] + output_col);
- *outptr = outbptr[1];
-}