#include "jsimd.h"
#include <altivec.h>
+
+/* Common code */
+
/* 8x8 transpose of eight __vector short registers (row## -> col##).
 * NOTE(review): the body here looks truncated -- it declares the row04l/h
 * and row15l/h merge temporaries and then jumps straight to producing col7
 * from col67e/col67o, which are never defined in this view.  The
 * intermediate merge steps for col0..col6 appear to be missing; confirm
 * against the upstream transpose before relying on this text. */
#define TRANSPOSE(row, col) \
{ \
__vector short row04l, row04h, row15l, row15h, \
col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
}
-static const __vector short constants __attribute__((aligned(16))) =
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT).  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ * the elements in arg3 + the most significant 17 bits of
+ * (the elements in arg1 * the elements in arg2).
+ */
+
+/* IFAST_CONST_SHIFT positions the IFAST_CONST_BITS-bit constants in the
+ * upper bits of a 16-bit word so that vec_madds' implicit >>15 leaves each
+ * product pre-scaled by 2^IFAST_PRE_MULTIPLY_SCALE_BITS (see the comment
+ * above for why there is an extra -1 relative to the SSE2 version). */
+#define IFAST_CONST_BITS 8
+#define IFAST_PRE_MULTIPLY_SCALE_BITS 2
+#define IFAST_CONST_SHIFT \
+ (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)
+
+/* The four fixed-point rotation constants used by the fast FDCT (values
+ * documented by the FIX() comments), left-shifted into position for
+ * vec_madds; they are vec_splat()ed out of this table before use.
+ * NOTE(review): the lines prefixed with '-' below are leftover removed
+ * diff context (the old constants with a hard-coded << 5); only the
+ * IFAST_CONST_SHIFT lines belong to the new table. */
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
{
- 98 << 5, /* FIX(0.382683433) */
- 139 << 5, /* FIX(0.541196100) */
- 181 << 5, /* FIX(0.707106781) */
- 334 << 5 /* FIX(1.306562965) */
+ 98 << IFAST_CONST_SHIFT, /* FIX(0.382683433) */
+ 139 << IFAST_CONST_SHIFT, /* FIX(0.541196100) */
+ 181 << IFAST_CONST_SHIFT, /* FIX(0.707106781) */
+ 334 << IFAST_CONST_SHIFT /* FIX(1.306562965) */
};
/* NOTE(review): this region appears mangled by the diff.  The macro body
 * loses its '\' line continuations after the "Even part" header (from the
 * "Constants" line on), and everything from "Pass 1" onward -- which loads
 * and stores `data` and itself invokes DO_FDCT_IFAST() -- reads like the
 * body of a jsimd_fdct_ifast_altivec() function whose opening declaration
 * is missing from this view.  Reconstruct from the upstream source rather
 * than from this text; only the constant renames (constants ->
 * jconst_fdct_ifast, 2 -> IFAST_PRE_MULTIPLY_SCALE_BITS) are clearly
 * intentional here. */
-#define DO_DCT() \
+#define DO_FDCT_IFAST() \
{ \
/* Even part */ \
\
/* Constants */
__vector short zero = vec_splat_s16(0),
- PW_0382 = vec_splat(constants, 0),
- PW_0541 = vec_splat(constants, 1),
- PW_0707 = vec_splat(constants, 2),
- PW_1306 = vec_splat(constants, 3);
- __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
+ PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+ PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+ PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+ PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+ __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+ vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
+
+ /* Pass 1: process rows. */
+
+ row0 = *(__vector short *)&data[0];
+ row1 = *(__vector short *)&data[8];
+ row2 = *(__vector short *)&data[16];
+ row3 = *(__vector short *)&data[24];
+ row4 = *(__vector short *)&data[32];
+ row5 = *(__vector short *)&data[40];
+ row6 = *(__vector short *)&data[48];
+ row7 = *(__vector short *)&data[56];
+
+ TRANSPOSE(row, col);
+
+ tmp0 = vec_add(col0, col7);
+ tmp7 = vec_sub(col0, col7);
+ tmp1 = vec_add(col1, col6);
+ tmp6 = vec_sub(col1, col6);
+ tmp2 = vec_add(col2, col5);
+ tmp5 = vec_sub(col2, col5);
+ tmp3 = vec_add(col3, col4);
+ tmp4 = vec_sub(col3, col4);
+
+ DO_FDCT_IFAST();
+
+ /* Pass 2: process columns. */
+
+ TRANSPOSE(out, row);
+
+ tmp0 = vec_add(row0, row7);
+ tmp7 = vec_sub(row0, row7);
+ tmp1 = vec_add(row1, row6);
+ tmp6 = vec_sub(row1, row6);
+ tmp2 = vec_add(row2, row5);
+ tmp5 = vec_sub(row2, row5);
+ tmp3 = vec_add(row3, row4);
+ tmp4 = vec_sub(row3, row4);
+
+ DO_FDCT_IFAST();
+
+ *(__vector short *)&data[0] = out0;
+ *(__vector short *)&data[8] = out1;
+ *(__vector short *)&data[16] = out2;
+ *(__vector short *)&data[24] = out3;
+ *(__vector short *)&data[32] = out4;
+ *(__vector short *)&data[40] = out5;
+ *(__vector short *)&data[48] = out6;
+ *(__vector short *)&data[56] = out7;
+}
+
+
+/* SLOW INTEGER FORWARD DCT */
+
+/* FIX(x) denotes x scaled by 2^ISLOW_CONST_BITS,
+ * e.g. F_0_541 == round(0.541196100 * 8192). */
+#define F_0_298 2446 /* FIX(0.298631336) */
+#define F_0_390 3196 /* FIX(0.390180644) */
+#define F_0_541 4433 /* FIX(0.541196100) */
+#define F_0_765 6270 /* FIX(0.765366865) */
+#define F_0_899 7373 /* FIX(0.899976223) */
+#define F_1_175 9633 /* FIX(1.175875602) */
+#define F_1_501 12299 /* FIX(1.501321110) */
+#define F_1_847 15137 /* FIX(1.847759065) */
+#define F_1_961 16069 /* FIX(1.961570560) */
+#define F_2_053 16819 /* FIX(2.053119869) */
+#define F_2_562 20995 /* FIX(2.562915447) */
+#define F_3_072 25172 /* FIX(3.072711026) */
+
+/* The pass-1 descale shift removes CONST_BITS but keeps an extra
+ * PASS1_BITS of scaling; the pass-2 shift removes both. */
+#define ISLOW_CONST_BITS 13
+#define ISLOW_PASS1_BITS 2
+#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)
+#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)
+
+/* Round-to-nearest bias terms (2^(shift-1)) added before the
+ * DESCALE_P1/DESCALE_P2 right shifts; the remaining vector elements are
+ * implicitly zero and unused -- each value is vec_splat()ed before use. */
+static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =
+{
+ 1 << (ISLOW_DESCALE_P1 - 1),
+ 1 << (ISLOW_DESCALE_P2 - 1)
+};
+
+/* Rounding bias (2^(PASS1_BITS-1)) for the arithmetic right shift that
+ * removes the pass-1 scaling during pass 2 (splatted as PW_DESCALE_P2X). */
+static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =
+{
+ 1 << (ISLOW_PASS1_BITS - 1)
+};
+
+/* Tail shared by both FDCT passes.  Finishes the even part (out2/out6
+ * from tmp13/tmp12) and computes the whole odd part (out1/out3/out5/out7
+ * from tmp4..tmp7).  Each 32-bit multiply-sum (vec_msums against the
+ * interleaved PW_* constant pairs) is biased with PD_DESCALE_P<PASS> and
+ * right-shifted by DESCALE_P<PASS> -- i.e. descale with round-to-nearest
+ * -- before vec_pack narrows it back to 16 bits.  PASS (1 or 2) selects
+ * the per-pass constants via token pasting. */
+#define DO_FDCT_ISLOW_COMMON(PASS) \
+{ \
+ tmp1312l = vec_mergeh(tmp13, tmp12); \
+ tmp1312h = vec_mergel(tmp13, tmp12); \
+ \
+ out2l = vec_msums(tmp1312l, PW_F130_F054, zero); \
+ out2h = vec_msums(tmp1312h, PW_F130_F054, zero); \
+ out6l = vec_msums(tmp1312l, PW_F054_MF130, zero); \
+ out6h = vec_msums(tmp1312h, PW_F054_MF130, zero); \
+ \
+ out2l = vec_add(out2l, PD_DESCALE_P##PASS); \
+ out2h = vec_add(out2h, PD_DESCALE_P##PASS); \
+ out2l = vec_sr(out2l, DESCALE_P##PASS); \
+ out2h = vec_sr(out2h, DESCALE_P##PASS); \
+ \
+ out6l = vec_add(out6l, PD_DESCALE_P##PASS); \
+ out6h = vec_add(out6h, PD_DESCALE_P##PASS); \
+ out6l = vec_sr(out6l, DESCALE_P##PASS); \
+ out6h = vec_sr(out6h, DESCALE_P##PASS); \
+ \
+ out2 = vec_pack(out2l, out2h); \
+ out6 = vec_pack(out6l, out6h); \
+ \
+ /* Odd part */ \
+ \
+ z3 = vec_add(tmp4, tmp6); \
+ z4 = vec_add(tmp5, tmp7); \
+ \
+ z34l = vec_mergeh(z3, z4); \
+ z34h = vec_mergel(z3, z4); \
+ \
+ z3l = vec_msums(z34l, PW_MF078_F117, zero); \
+ z3h = vec_msums(z34h, PW_MF078_F117, zero); \
+ z4l = vec_msums(z34l, PW_F117_F078, zero); \
+ z4h = vec_msums(z34h, PW_F117_F078, zero); \
+ \
+ tmp47l = vec_mergeh(tmp4, tmp7); \
+ tmp47h = vec_mergel(tmp4, tmp7); \
+ \
+ tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero); \
+ tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero); \
+ tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero); \
+ tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero); \
+ \
+ out7l = vec_add(z3l, tmp4l); \
+ out7h = vec_add(z3h, tmp4h); \
+ out1l = vec_add(z4l, tmp7l); \
+ out1h = vec_add(z4h, tmp7h); \
+ \
+ out7l = vec_add(out7l, PD_DESCALE_P##PASS); \
+ out7h = vec_add(out7h, PD_DESCALE_P##PASS); \
+ out7l = vec_sr(out7l, DESCALE_P##PASS); \
+ out7h = vec_sr(out7h, DESCALE_P##PASS); \
+ \
+ out1l = vec_add(out1l, PD_DESCALE_P##PASS); \
+ out1h = vec_add(out1h, PD_DESCALE_P##PASS); \
+ out1l = vec_sr(out1l, DESCALE_P##PASS); \
+ out1h = vec_sr(out1h, DESCALE_P##PASS); \
+ \
+ out7 = vec_pack(out7l, out7h); \
+ out1 = vec_pack(out1l, out1h); \
+ \
+ tmp56l = vec_mergeh(tmp5, tmp6); \
+ tmp56h = vec_mergel(tmp5, tmp6); \
+ \
+ tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero); \
+ tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero); \
+ tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero); \
+ tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero); \
+ \
+ out5l = vec_add(tmp5l, z4l); \
+ out5h = vec_add(tmp5h, z4h); \
+ out3l = vec_add(tmp6l, z3l); \
+ out3h = vec_add(tmp6h, z3h); \
+ \
+ out5l = vec_add(out5l, PD_DESCALE_P##PASS); \
+ out5h = vec_add(out5h, PD_DESCALE_P##PASS); \
+ out5l = vec_sr(out5l, DESCALE_P##PASS); \
+ out5h = vec_sr(out5h, DESCALE_P##PASS); \
+ \
+ out3l = vec_add(out3l, PD_DESCALE_P##PASS); \
+ out3h = vec_add(out3h, PD_DESCALE_P##PASS); \
+ out3l = vec_sr(out3l, DESCALE_P##PASS); \
+ out3h = vec_sr(out3h, DESCALE_P##PASS); \
+ \
+ out5 = vec_pack(out5l, out5h); \
+ out3 = vec_pack(out3l, out3h); \
+}
+
+/* Pass-1 even-part head: butterflies the precomputed sums/differences
+ * (tmp0..tmp3) and forms out0/out4.  Pass 1 scales the data UP by
+ * PASS1_BITS, so out0/out4 are simply left-shifted -- no rounding bias is
+ * needed when scaling up.  Finishes via DO_FDCT_ISLOW_COMMON(1). */
+#define DO_FDCT_ISLOW_ROWS() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_sl(out0, PASS1_BITS); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_sl(out4, PASS1_BITS); \
+ \
+ DO_FDCT_ISLOW_COMMON(1); \
+}
+
+/* Pass-2 even-part head: same butterfly as pass 1, but out0/out4 now
+ * scale DOWN by PASS1_BITS, so the PW_DESCALE_P2X rounding bias is added
+ * and an arithmetic right shift (vec_sra) performs the division with
+ * round-to-nearest.  Finishes via DO_FDCT_ISLOW_COMMON(2). */
+#define DO_FDCT_ISLOW_COLS() \
+{ \
+ /* Even part */ \
+ \
+ tmp10 = vec_add(tmp0, tmp3); \
+ tmp13 = vec_sub(tmp0, tmp3); \
+ tmp11 = vec_add(tmp1, tmp2); \
+ tmp12 = vec_sub(tmp1, tmp2); \
+ \
+ out0 = vec_add(tmp10, tmp11); \
+ out0 = vec_add(out0, PW_DESCALE_P2X); \
+ out0 = vec_sra(out0, PASS1_BITS); \
+ out4 = vec_sub(tmp10, tmp11); \
+ out4 = vec_add(out4, PW_DESCALE_P2X); \
+ out4 = vec_sra(out4, PASS1_BITS); \
+ \
+ DO_FDCT_ISLOW_COMMON(2); \
+}
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+ __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+ col0, col1, col2, col3, col4, col5, col6, col7,
+ tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+ tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+ z3, z4, z34l, z34h,
+ out0, out1, out2, out3, out4, out5, out6, out7;
+ __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
+ z3l, z3h, z4l, z4h,
+ out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+ out7l, out7h;
+
+ __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
+ F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
+ F_0_541 + F_0_765, F_0_541};
+ __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
+ F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
+ F_0_541, F_0_541 - F_1_847};
+ __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
+ F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
+ F_1_175 - F_1_961, F_1_175};
+ __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
+ F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
+ F_1_175, F_1_175 - F_0_390};
+ __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
+ F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
+ F_0_298 - F_0_899, -F_0_899};
+ __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
+ -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
+ -F_0_899, F_1_501 - F_0_899};
+ __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
+ F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
+ F_2_053 - F_2_562, -F_2_562};
+ __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
+ -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
+ -F_2_562, F_3_072 - F_2_562};
+ __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+
+ /* Constants */
+ __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
+ __vector int zero = vec_splat_s32(0),
+ PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
+ PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
+ __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
+ DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
/* Pass 1: process rows. */
tmp3 = vec_add(col3, col4);
tmp4 = vec_sub(col3, col4);
- DO_DCT();
+ DO_FDCT_ISLOW_ROWS();
/* Pass 2: process columns. */
tmp3 = vec_add(row3, row4);
tmp4 = vec_sub(row3, row4);
- DO_DCT();
+ DO_FDCT_ISLOW_COLS();
*(__vector short *)&data[0] = out0;
*(__vector short *)&data[8] = out1;