AltiVec SIMD implementation of slow integer forward DCT; Clean up fast integer forward DCT

author DRC <dcommander@users.sourceforge.net>

Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)

committer DRC <dcommander@users.sourceforge.net>

Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)
author DRC <dcommander@users.sourceforge.net>
Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)
committer DRC <dcommander@users.sourceforge.net>
Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)
diff --git a/simd/jsimd.h b/simd/jsimd.h

index b03297289e1305960a4914c910282308d48628d5..4dcdfc1af0f61d45bb939ac8eea81d2b40f58cca 100644 (file)
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -545,6 +545,8 @@ EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM * data);
  
  EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM * data);
  
+EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM * data);
+
  /* Fast Integer Forward DCT */
  EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM * data);
  
diff --git a/simd/jsimd_powerpc.c b/simd/jsimd_powerpc.c

index a9a5965d09dac899a0596bfbf811b0f29d2e342a..ff37c5f00b563fc1f8061024722ea249c50fca7a 100644 (file)
--- a/simd/jsimd_powerpc.c
+++ b/simd/jsimd_powerpc.c
@@ -226,6 +226,17 @@ jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
  GLOBAL(int)
  jsimd_can_fdct_islow (void)
  {
+  init_simd();  /* called before simd_support is read below; presumably it populates that mask -- confirm */
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)  /* any DCT block size other than 8 is reported as unsupported */
+    return 0;
+  if (sizeof(DCTELEM) != 2)  /* any DCTELEM that is not exactly 2 bytes is reported as unsupported */
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)  /* returns 1 only when the JSIMD_ALTIVEC bit is set */
+    return 1;
+
    return 0;
  }
  
@@ -255,6 +266,7 @@ jsimd_can_fdct_float (void)
  GLOBAL(void)
  jsimd_fdct_islow (DCTELEM * data)
  {
+  jsimd_fdct_islow_altivec(data);  /* forwards unconditionally; simd_support is not re-checked here, so callers are presumably expected to consult jsimd_can_fdct_islow() first -- confirm */
  }
  
  GLOBAL(void)
diff --git a/simd/jsimd_powerpc_altivec.c b/simd/jsimd_powerpc_altivec.c

index e18eaa8e4008266b5755628d1a0d1b1eb8488c8f..ef32545d7e259caac8807707808c0e3da81572c8 100644 (file)
--- a/simd/jsimd_powerpc_altivec.c
+++ b/simd/jsimd_powerpc_altivec.c
@@ -29,6 +29,9 @@
  #include "jsimd.h"
  #include <altivec.h>
  
+
+/* Common code */
+
  #define TRANSPOSE(row, col)  \
  {  \
    __vector short row04l, row04h, row15l, row15h,  \
@@ -67,15 +70,30 @@
    col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
  }
  
-static const __vector short constants __attribute__((aligned(16))) =
+
+/* FAST INTEGER FORWARD DCT
+ *
+ * This is similar to the SSE2 implementation, except that we left-shift the
+ * constants by 1 less bit (the -1 in IFAST_CONST_SHIFT.)  This is because
+ * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
+ *   the elements in arg3 + the most significant 17 bits of
+ *     (the elements in arg1 * the elements in arg2).
+ */
+
+#define IFAST_CONST_BITS 8  /* the table multipliers 98/139/181/334 equal round(x * 2^8) for the x in each FIX() comment */
+#define IFAST_PRE_MULTIPLY_SCALE_BITS 2  /* also splatted into PRE_MULTIPLY_SCALE_BITS inside jsimd_fdct_ifast_altivec() */
+#define IFAST_CONST_SHIFT \
+  (16 - IFAST_PRE_MULTIPLY_SCALE_BITS - IFAST_CONST_BITS - 1)  /* 16 - 2 - 8 - 1 = 5, the same shift the literal "<< 5" entries used */
+
+static const __vector short jconst_fdct_ifast __attribute__((aligned(16))) =
  {
-  98 << 5,   /* FIX(0.382683433) */
-  139 << 5,  /* FIX(0.541196100) */
-  181 << 5,  /* FIX(0.707106781) */
-  334 << 5   /* FIX(1.306562965) */
+  98 << IFAST_CONST_SHIFT,   /* FIX(0.382683433) */
+  139 << IFAST_CONST_SHIFT,  /* FIX(0.541196100) */
+  181 << IFAST_CONST_SHIFT,  /* FIX(0.707106781) */
+  334 << IFAST_CONST_SHIFT   /* FIX(1.306562965) */
  };
  
-#define DO_DCT()  \
+#define DO_FDCT_IFAST()  \
  {  \
    /* Even part */  \
    \
@@ -134,11 +152,266 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
  
    /* Constants */
    __vector short zero = vec_splat_s16(0),
-    PW_0382 = vec_splat(constants, 0),
-    PW_0541 = vec_splat(constants, 1),
-    PW_0707 = vec_splat(constants, 2),
-    PW_1306 = vec_splat(constants, 3);
-  __vector unsigned short PRE_MULTIPLY_SCALE_BITS = vec_splat_u16(2);
+    PW_0382 = vec_splat(jconst_fdct_ifast, 0),
+    PW_0541 = vec_splat(jconst_fdct_ifast, 1),
+    PW_0707 = vec_splat(jconst_fdct_ifast, 2),
+    PW_1306 = vec_splat(jconst_fdct_ifast, 3);
+  __vector unsigned short PRE_MULTIPLY_SCALE_BITS =
+    vec_splat_u16(IFAST_PRE_MULTIPLY_SCALE_BITS);
+
+  /* Pass 1: process rows. */
+
+  row0 = *(__vector short *)&data[0];
+  row1 = *(__vector short *)&data[8];
+  row2 = *(__vector short *)&data[16];
+  row3 = *(__vector short *)&data[24];
+  row4 = *(__vector short *)&data[32];
+  row5 = *(__vector short *)&data[40];
+  row6 = *(__vector short *)&data[48];
+  row7 = *(__vector short *)&data[56];
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_IFAST();
+
+  /* Pass 2: process columns. */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_IFAST();
+
+  *(__vector short *)&data[0] = out0;
+  *(__vector short *)&data[8] = out1;
+  *(__vector short *)&data[16] = out2;
+  *(__vector short *)&data[24] = out3;
+  *(__vector short *)&data[32] = out4;
+  *(__vector short *)&data[40] = out5;
+  *(__vector short *)&data[48] = out6;
+  *(__vector short *)&data[56] = out7;
+}
+
+
+/* SLOW INTEGER FORWARD DCT */
+
+#define F_0_298 2446   /* FIX(0.298631336) */
+#define F_0_390 3196   /* FIX(0.390180644) */
+#define F_0_541 4433   /* FIX(0.541196100) */
+#define F_0_765 6270   /* FIX(0.765366865) */
+#define F_0_899 7373   /* FIX(0.899976223) */
+#define F_1_175 9633   /* FIX(1.175875602) */
+#define F_1_501 12299  /* FIX(1.501321110) */
+#define F_1_847 15137  /* FIX(1.847759065) */
+#define F_1_961 16069  /* FIX(1.961570560) */
+#define F_2_053 16819  /* FIX(2.053119869) */
+#define F_2_562 20995  /* FIX(2.562915447) */
+#define F_3_072 25172  /* FIX(3.072711026) */
+
+#define ISLOW_CONST_BITS 13  /* each F_* value above equals round(x * 2^13) for the x in its FIX() comment */
+#define ISLOW_PASS1_BITS 2  /* shift count applied to out0/out4: left in the row pass, right in the column pass */
+#define ISLOW_DESCALE_P1 (ISLOW_CONST_BITS - ISLOW_PASS1_BITS)  /* 13 - 2 = 11: right shift used by the row pass */
+#define ISLOW_DESCALE_P2 (ISLOW_CONST_BITS + ISLOW_PASS1_BITS)  /* 13 + 2 = 15: right shift used by the column pass */
+
+static const __vector int jconst_fdct_islow __attribute__((aligned(16))) =  /* 32-bit rounding terms; only elements 0 and 1 are listed and read */
+{
+  1 << (ISLOW_DESCALE_P1 - 1),  /* 1 << 10, half of 2^11; splatted into PD_DESCALE_P1 and added before the shift by 11 */
+  1 << (ISLOW_DESCALE_P2 - 1)  /* 1 << 14, half of 2^15; splatted into PD_DESCALE_P2 and added before the shift by 15 */
+};
+
+static const __vector short jconst_fdct_islow2 __attribute__((aligned(16))) =  /* 16-bit rounding term; only element 0 is listed and read */
+{
+  1 << (ISLOW_PASS1_BITS - 1)  /* 1 << 1 = 2, half of 2^PASS1_BITS; splatted into PW_DESCALE_P2X for out0/out4 in the column pass */
+};
+
+#define DO_FDCT_ISLOW_COMMON(PASS)  /* computes out1-3 and out5-7 from tmp4-7, tmp12, tmp13; PASS is pasted onto PD_DESCALE_P / DESCALE_P, so it must be 1 or 2; all names used must be declared by the expanding function */  \
+{  \
+  tmp1312l = vec_mergeh(tmp13, tmp12);  /* interleave into (tmp13, tmp12) halfword pairs, first half */  \
+  tmp1312h = vec_mergel(tmp13, tmp12);  /* same, second half */  \
+  \
+  out2l = vec_msums(tmp1312l, PW_F130_F054, zero);  /* per word: tmp13 * (F_0_541 + F_0_765) + tmp12 * F_0_541 */  \
+  out2h = vec_msums(tmp1312h, PW_F130_F054, zero);  \
+  out6l = vec_msums(tmp1312l, PW_F054_MF130, zero);  /* per word: tmp13 * F_0_541 + tmp12 * (F_0_541 - F_1_847) */  \
+  out6h = vec_msums(tmp1312h, PW_F054_MF130, zero);  \
+  \
+  out2l = vec_add(out2l, PD_DESCALE_P##PASS);  /* add the rounding term for this pass */  \
+  out2h = vec_add(out2h, PD_DESCALE_P##PASS);  \
+  out2l = vec_sr(out2l, DESCALE_P##PASS);  /* NOTE(review): vec_sr on signed words; the later vec_pack keeps only the low halfword, so for shifts of 11/15 the packed result should match vec_sra -- confirm */  \
+  out2h = vec_sr(out2h, DESCALE_P##PASS);  \
+  \
+  out6l = vec_add(out6l, PD_DESCALE_P##PASS);  \
+  out6h = vec_add(out6h, PD_DESCALE_P##PASS);  \
+  out6l = vec_sr(out6l, DESCALE_P##PASS);  \
+  out6h = vec_sr(out6h, DESCALE_P##PASS);  \
+  \
+  out2 = vec_pack(out2l, out2h);  /* narrow the two word vectors back into one vector of shorts */  \
+  out6 = vec_pack(out6l, out6h);  \
+  \
+  /* Odd part: produces out1, out3, out5, out7 from tmp4..tmp7 */  \
+  \
+  z3 = vec_add(tmp4, tmp6);  \
+  z4 = vec_add(tmp5, tmp7);  \
+  \
+  z34l = vec_mergeh(z3, z4);  /* (z3, z4) halfword pairs */  \
+  z34h = vec_mergel(z3, z4);  \
+  \
+  z3l = vec_msums(z34l, PW_MF078_F117, zero);  /* per word: z3 * (F_1_175 - F_1_961) + z4 * F_1_175 */  \
+  z3h = vec_msums(z34h, PW_MF078_F117, zero);  \
+  z4l = vec_msums(z34l, PW_F117_F078, zero);  /* per word: z3 * F_1_175 + z4 * (F_1_175 - F_0_390) */  \
+  z4h = vec_msums(z34h, PW_F117_F078, zero);  \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7);  /* (tmp4, tmp7) halfword pairs */  \
+  tmp47h = vec_mergel(tmp4, tmp7);  \
+  \
+  tmp4l = vec_msums(tmp47l, PW_MF060_MF089, zero);  /* per word: tmp4 * (F_0_298 - F_0_899) + tmp7 * -F_0_899 */  \
+  tmp4h = vec_msums(tmp47h, PW_MF060_MF089, zero);  \
+  tmp7l = vec_msums(tmp47l, PW_MF089_F060, zero);  /* per word: tmp4 * -F_0_899 + tmp7 * (F_1_501 - F_0_899) */  \
+  tmp7h = vec_msums(tmp47h, PW_MF089_F060, zero);  \
+  \
+  out7l = vec_add(z3l, tmp4l);  \
+  out7h = vec_add(z3h, tmp4h);  \
+  out1l = vec_add(z4l, tmp7l);  \
+  out1h = vec_add(z4h, tmp7h);  \
+  \
+  out7l = vec_add(out7l, PD_DESCALE_P##PASS);  /* round, shift and pack exactly as for out2/out6 above */  \
+  out7h = vec_add(out7h, PD_DESCALE_P##PASS);  \
+  out7l = vec_sr(out7l, DESCALE_P##PASS);  \
+  out7h = vec_sr(out7h, DESCALE_P##PASS);  \
+  \
+  out1l = vec_add(out1l, PD_DESCALE_P##PASS);  \
+  out1h = vec_add(out1h, PD_DESCALE_P##PASS);  \
+  out1l = vec_sr(out1l, DESCALE_P##PASS);  \
+  out1h = vec_sr(out1h, DESCALE_P##PASS);  \
+  \
+  out7 = vec_pack(out7l, out7h);  \
+  out1 = vec_pack(out1l, out1h);  \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6);  /* (tmp5, tmp6) halfword pairs */  \
+  tmp56h = vec_mergel(tmp5, tmp6);  \
+  \
+  tmp5l = vec_msums(tmp56l, PW_MF050_MF256, zero);  /* per word: tmp5 * (F_2_053 - F_2_562) + tmp6 * -F_2_562 */  \
+  tmp5h = vec_msums(tmp56h, PW_MF050_MF256, zero);  \
+  tmp6l = vec_msums(tmp56l, PW_MF256_F050, zero);  /* per word: tmp5 * -F_2_562 + tmp6 * (F_3_072 - F_2_562) */  \
+  tmp6h = vec_msums(tmp56h, PW_MF256_F050, zero);  \
+  \
+  out5l = vec_add(tmp5l, z4l);  /* z4l/z4h are reused from the out1 computation */  \
+  out5h = vec_add(tmp5h, z4h);  \
+  out3l = vec_add(tmp6l, z3l);  /* z3l/z3h are reused from the out7 computation */  \
+  out3h = vec_add(tmp6h, z3h);  \
+  \
+  out5l = vec_add(out5l, PD_DESCALE_P##PASS);  \
+  out5h = vec_add(out5h, PD_DESCALE_P##PASS);  \
+  out5l = vec_sr(out5l, DESCALE_P##PASS);  \
+  out5h = vec_sr(out5h, DESCALE_P##PASS);  \
+  \
+  out3l = vec_add(out3l, PD_DESCALE_P##PASS);  \
+  out3h = vec_add(out3h, PD_DESCALE_P##PASS);  \
+  out3l = vec_sr(out3l, DESCALE_P##PASS);  \
+  out3h = vec_sr(out3h, DESCALE_P##PASS);  \
+  \
+  out5 = vec_pack(out5l, out5h);  \
+  out3 = vec_pack(out3l, out3h);  \
+}
+
+#define DO_FDCT_ISLOW_ROWS()  /* pass-1 butterfly: reads tmp0..tmp7 and writes out0..out7 in the expanding function */  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_sl(out0, PASS1_BITS);  /* scale up by 2^PASS1_BITS; no multiplier is applied to out0/out4 */  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_sl(out4, PASS1_BITS);  \
+  \
+  DO_FDCT_ISLOW_COMMON(1);  /* fills out1-3 and out5-7 using PD_DESCALE_P1 / DESCALE_P1 */  \
+}
+
+#define DO_FDCT_ISLOW_COLS()  /* pass-2 butterfly: reads tmp0..tmp7 and writes out0..out7 in the expanding function */  \
+{  \
+  /* Even part */  \
+  \
+  tmp10 = vec_add(tmp0, tmp3);  \
+  tmp13 = vec_sub(tmp0, tmp3);  \
+  tmp11 = vec_add(tmp1, tmp2);  \
+  tmp12 = vec_sub(tmp1, tmp2);  \
+  \
+  out0  = vec_add(tmp10, tmp11);  \
+  out0  = vec_add(out0, PW_DESCALE_P2X);  /* add the 16-bit rounding term before shifting */  \
+  out0  = vec_sra(out0, PASS1_BITS);  /* arithmetic right shift by PASS1_BITS, the inverse of the left shift in the row pass */  \
+  out4  = vec_sub(tmp10, tmp11);  \
+  out4  = vec_add(out4, PW_DESCALE_P2X);  \
+  out4  = vec_sra(out4, PASS1_BITS);  \
+  \
+  DO_FDCT_ISLOW_COMMON(2);  /* fills out1-3 and out5-7 using PD_DESCALE_P2 / DESCALE_P2 */  \
+}
+
+void
+jsimd_fdct_islow_altivec (DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int tmp4l, tmp4h, tmp5l, tmp5h, tmp6l, tmp6h, tmp7l, tmp7h,
+    z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  __vector short PW_F130_F054 = {F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541, F_0_541 + F_0_765, F_0_541,
+    F_0_541 + F_0_765, F_0_541};
+  __vector short PW_F054_MF130 = {F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847, F_0_541, F_0_541 - F_1_847,
+    F_0_541, F_0_541 - F_1_847};
+  __vector short PW_MF078_F117 = {F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175, F_1_175 - F_1_961, F_1_175,
+    F_1_175 - F_1_961, F_1_175};
+  __vector short PW_F117_F078 = {F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390, F_1_175, F_1_175 - F_0_390,
+    F_1_175, F_1_175 - F_0_390};
+  __vector short PW_MF060_MF089 = {F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899, F_0_298 - F_0_899, -F_0_899,
+    F_0_298 - F_0_899, -F_0_899};
+  __vector short PW_MF089_F060 = {-F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899, -F_0_899, F_1_501 - F_0_899,
+    -F_0_899, F_1_501 - F_0_899};
+  __vector short PW_MF050_MF256 = {F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562, F_2_053 - F_2_562, -F_2_562,
+    F_2_053 - F_2_562, -F_2_562};
+  __vector short PW_MF256_F050 = {-F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562, -F_2_562, F_3_072 - F_2_562,
+    -F_2_562, F_3_072 - F_2_562};
+  __vector short PW_DESCALE_P2X = vec_splat(jconst_fdct_islow2, 0);
+
+  /* Constants */
+  __vector unsigned short PASS1_BITS = vec_splat_u16(ISLOW_PASS1_BITS);
+  __vector int zero = vec_splat_s32(0),
+    PD_DESCALE_P1 = vec_splat(jconst_fdct_islow, 0),
+    PD_DESCALE_P2 = vec_splat(jconst_fdct_islow, 1);
+  __vector unsigned int DESCALE_P1 = vec_splat_u32(ISLOW_DESCALE_P1),
+    DESCALE_P2 = vec_splat_u32(ISLOW_DESCALE_P2);
  
    /* Pass 1: process rows. */
  
@@ -162,7 +435,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
    tmp3 = vec_add(col3, col4);
    tmp4 = vec_sub(col3, col4);
  
-  DO_DCT();
+  DO_FDCT_ISLOW_ROWS();
  
    /* Pass 2: process columns. */
  
@@ -177,7 +450,7 @@ jsimd_fdct_ifast_altivec (DCTELEM *data)
    tmp3 = vec_add(row3, row4);
    tmp4 = vec_sub(row3, row4);
  
-  DO_DCT();
+  DO_FDCT_ISLOW_COLS();
  
    *(__vector short *)&data[0] = out0;
    *(__vector short *)&data[8] = out1;
author	DRC <dcommander@users.sourceforge.net>
	Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)
committer	DRC <dcommander@users.sourceforge.net>
	Wed, 17 Dec 2014 08:04:39 +0000 (08:04 +0000)
simd/jsimd.h		patch \| blob \| history
simd/jsimd_powerpc.c		patch \| blob \| history
simd/jsimd_powerpc_altivec.c		patch \| blob \| history