From: Guillaume Poirier
Date: Mon, 12 Nov 2007 20:36:33 +0000 (+0000)
Subject: Add AltiVec implementation of add4x4_idct, add8x8_idct, add16x16_idct, 3.2x faster...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a8650641e1006d9750cc97d3c1672871c4549296;p=libx264

Add AltiVec implementation of add4x4_idct, add8x8_idct, add16x16_idct, 3.2x faster on average
1.05x faster overall with default encoding options

Patch by Noboru Asai % noboru DD asai AA gmail DD com %

git-svn-id: svn://svn.videolan.org/x264/trunk@685 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/dct.c b/common/dct.c
index 15bcfd04..0e724e42 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -440,6 +440,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
         dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
 
+        dctf->add4x4_idct = x264_add4x4_idct_altivec;
+        dctf->add8x8_idct = x264_add8x8_idct_altivec;
+        dctf->add16x16_idct = x264_add16x16_idct_altivec;
+
         dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index 87aab07d..87685305 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -233,6 +233,99 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *p
  * IDCT transform:
  ****************************************************************************/
 
+#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \
+{ \
+    /* a0 = SRC(0) + SRC(2); */ \
+    vec_s16_t a0v = vec_add(s0, s2); \
+    /* a1 = SRC(0) - SRC(2); */ \
+    vec_s16_t a1v = vec_sub(s0, s2); \
+    /* a2 = (SRC(1)>>1) - SRC(3); */ \
+    vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3); \
+    /* a3 = (SRC(3)>>1) + SRC(1); */ \
+    vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1); \
+    /* DST(0, a0 + a3); */ \
+    d0 = vec_add(a0v, a3v); \
+    /* DST(1, a1 + a2); */ \
+    d1 = vec_add(a1v, a2v); \
+    /* DST(2, a1 - a2); */ \
+    d2 = vec_sub(a1v, a2v); \
+    /* DST(3, a0 - a3); */ \
+    d3 = vec_sub(a0v, a3v); \
+}
+
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
+    vdst_orig = vec_ld(0, dst); \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
+    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \
+    va = vec_add(va, vdst_ss); \
+    va_u8 = vec_s16_to_u8(va); \
+    va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
+    vec_ste(va_u32, element, (uint32_t*)dst);
+
+#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv) \
+{ \
+    /* unaligned load */ \
+    vec_u8_t lv = vec_ld(0, dest); \
+    vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
+    vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
+    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
+    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
+    vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \
+    /* unaligned store */ \
+    vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0); \
+    int element = ((unsigned long)dest & 0xf) >> 2; \
+    vec_ste(bodyv, element, (uint32_t *)dest); \
+}
+
+void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[4][4] )
+{
+    vec_u16_t onev = vec_splat_u16(1);
+
+    dct[0][0] += 32; // rounding for the >>6 at the end
+
+    vec_s16_t s0, s1, s2, s3;
+
+    s0 = vec_ld( 0x00, (int16_t*)dct );
+    s1 = vec_sld( s0, s0, 8 );
+    s2 = vec_ld( 0x10, (int16_t*)dct );
+    s3 = vec_sld( s2, s2, 8 );
+
+    vec_s16_t d0, d1, d2, d3;
+    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );
+
+    vec_s16_t tr0, tr1, tr2, tr3;
+
+    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );
+
+    vec_s16_t idct0, idct1, idct2, idct3;
+    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );
+
+    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
+    vec_u16_t sixv = vec_splat_u16(6);
+    LOAD_ZERO;
+
+    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
+}
+
+void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] )
+{
+    x264_add4x4_idct_altivec( &p_dst[0], dct[0] );
+    x264_add4x4_idct_altivec( &p_dst[4], dct[1] );
+    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] );
+}
+
+void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] )
+{
+    x264_add8x8_idct_altivec( &p_dst[0], &dct[0] );
+    x264_add8x8_idct_altivec( &p_dst[8], &dct[4] );
+    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
+    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
+}
+
 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7)\
 {\
     /* a0 = SRC(0) + SRC(4); */ \
@@ -362,4 +455,3 @@ void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][8][8] )
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
 }
-
diff --git a/common/ppc/dct.h b/common/ppc/dct.h
index 0e0df524..9aa60456 100644
--- a/common/ppc/dct.h
+++ b/common/ppc/dct.h
@@ -32,6 +32,10 @@ void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
 void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
         uint8_t *pix1, uint8_t *pix2 );
 
+void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] );
+
 void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
         uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index e13e9753..7768f8ae 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -64,7 +64,7 @@
 #define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
 
 #define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
-#define vec_s16_to_u8(v) vec_pack( v, zero_u16v )
+#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )
 
 /***********************************************************************
  * PREP_LOAD: declares two vectors required to perform unaligned loads
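
For reference (not part of the patch): below is a minimal scalar sketch of the transform the new AltiVec routines vectorize, written only from the comments in IDCT_1D_ALTIVEC and ALTIVEC_STORE4_SUM_CLIP above. The names add4x4_idct_ref and clip_u8 are illustrative, not taken from the x264 source. The explicit clamp at the end is also why the ppccommon.h hunk switches vec_s16_to_u8() from vec_pack (modular truncation) to vec_packsu (unsigned saturating pack): the reconstructed pixels must saturate to [0,255].

#include <stdint.h>

#define FDEC_STRIDE 32   /* x264's decoded-picture stride */

static uint8_t clip_u8( int x )   /* same clamp vec_packsu performs */
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Scalar sketch of add4x4_idct: 1-D butterfly on rows, then on columns,
 * then add to the prediction with (x + 32) >> 6 rounding.  The AltiVec
 * version folds the +32 into dct[0][0] before the transform, which is
 * equivalent because the transform spreads that DC offset to all 16 outputs. */
static void add4x4_idct_ref( uint8_t *dst, int16_t dct[4][4] )
{
    int tmp[4][4], d[4][4];
    int i, x, y;

    for( i = 0; i < 4; i++ )          /* row pass */
    {
        int a0 = dct[i][0] + dct[i][2];
        int a1 = dct[i][0] - dct[i][2];
        int a2 = (dct[i][1] >> 1) - dct[i][3];
        int a3 = (dct[i][3] >> 1) + dct[i][1];
        tmp[i][0] = a0 + a3;
        tmp[i][1] = a1 + a2;
        tmp[i][2] = a1 - a2;
        tmp[i][3] = a0 - a3;
    }
    for( i = 0; i < 4; i++ )          /* column pass, same butterfly */
    {
        int a0 = tmp[0][i] + tmp[2][i];
        int a1 = tmp[0][i] - tmp[2][i];
        int a2 = (tmp[1][i] >> 1) - tmp[3][i];
        int a3 = (tmp[3][i] >> 1) + tmp[1][i];
        d[0][i] = a0 + a3;
        d[1][i] = a1 + a2;
        d[2][i] = a1 - a2;
        d[3][i] = a0 - a3;
    }
    for( y = 0; y < 4; y++ )          /* reconstruct: prediction + residual */
        for( x = 0; x < 4; x++ )
            dst[y*FDEC_STRIDE + x] =
                clip_u8( dst[y*FDEC_STRIDE + x] + ((d[y][x] + 32) >> 6) );
}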