Add AltiVec implementation ofx264_hpel_filter. Provides a 10-11% overall speed-up...

author Guillaume Poirier <gpoirier@mplayerhq.hu>

Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)

committer Guillaume Poirier <gpoirier@mplayerhq.hu>

Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)
author Guillaume Poirier <gpoirier@mplayerhq.hu>
Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)
committer Guillaume Poirier <gpoirier@mplayerhq.hu>
Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)
diff --git a/common/ppc/mc.c b/common/ppc/mc.c

index 96245174f0c5e296558739acc9f4fd340b10b697..bd6c514a33fba7dd03483550d01dacf76bab147f 100644 (file)
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -402,9 +402,231 @@ static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
      }
  }
  
+#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
+{                                                     \
+    t1v = vec_add( t1v, t6v );                        \
+    t2v = vec_add( t2v, t5v );                        \
+    t3v = vec_add( t3v, t4v );                        \
+                                                      \
+    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
+    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
+    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
+    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
+    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
+    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
+}
+
+#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
+{                                                     \
+    t1v = vec_add( t1v, t6v );                        \
+    t2v = vec_add( t2v, t5v );                        \
+    t3v = vec_add( t3v, t4v );                        \
+                                                      \
+    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
+    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
+    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
+    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
+    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
+    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
+}
+
+#define HPEL_FILTER_HORIZONTAL()                            \
+{                                                           \
+    VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
+                                                            \
+    src2v = vec_sld( src1v, src6v,  1 );                    \
+    src3v = vec_sld( src1v, src6v,  2 );                    \
+    src4v = vec_sld( src1v, src6v,  3 );                    \
+    src5v = vec_sld( src1v, src6v,  4 );                    \
+    src6v = vec_sld( src1v, src6v,  5 );                    \
+                                                            \
+    temp1v = vec_u8_to_s16_h( src1v );                      \
+    temp2v = vec_u8_to_s16_h( src2v );                      \
+    temp3v = vec_u8_to_s16_h( src3v );                      \
+    temp4v = vec_u8_to_s16_h( src4v );                      \
+    temp5v = vec_u8_to_s16_h( src5v );                      \
+    temp6v = vec_u8_to_s16_h( src6v );                      \
+                                                            \
+    HPEL_FILTER_1( temp1v, temp2v, temp3v,                  \
+                   temp4v, temp5v, temp6v );                \
+                                                            \
+    dest1v = vec_add( temp1v, sixteenv );                   \
+    dest1v = vec_sra( dest1v, fivev );                      \
+                                                            \
+    temp1v = vec_u8_to_s16_l( src1v );                      \
+    temp2v = vec_u8_to_s16_l( src2v );                      \
+    temp3v = vec_u8_to_s16_l( src3v );                      \
+    temp4v = vec_u8_to_s16_l( src4v );                      \
+    temp5v = vec_u8_to_s16_l( src5v );                      \
+    temp6v = vec_u8_to_s16_l( src6v );                      \
+                                                            \
+    HPEL_FILTER_1( temp1v, temp2v, temp3v,                  \
+                   temp4v, temp5v, temp6v );                \
+                                                            \
+    dest2v = vec_add( temp1v, sixteenv );                   \
+    dest2v = vec_sra( dest2v, fivev );                      \
+                                                            \
+    destv = vec_packsu( dest1v, dest2v );                   \
+                                                            \
+    VEC_STORE16( destv, &dsth[x+i_stride*y] );              \
+}
+
+#define HPEL_FILTER_VERTICAL()                               \
+{                                                            \
+    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); \
+                                                             \
+    temp1v = vec_u8_to_s16_h( src1v );                       \
+    temp2v = vec_u8_to_s16_h( src2v );                       \
+    temp3v = vec_u8_to_s16_h( src3v );                       \
+    temp4v = vec_u8_to_s16_h( src4v );                       \
+    temp5v = vec_u8_to_s16_h( src5v );                       \
+    temp6v = vec_u8_to_s16_h( src6v );                       \
+                                                             \
+    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
+                   temp4v, temp5v, temp6v );                 \
+                                                             \
+    dest1v = vec_add( temp1v, sixteenv );                    \
+    dest1v = vec_sra( dest1v, fivev );                       \
+                                                             \
+    temp4v = vec_u8_to_s16_l( src1v );                       \
+    temp5v = vec_u8_to_s16_l( src2v );                       \
+    temp6v = vec_u8_to_s16_l( src3v );                       \
+    temp7v = vec_u8_to_s16_l( src4v );                       \
+    temp8v = vec_u8_to_s16_l( src5v );                       \
+    temp9v = vec_u8_to_s16_l( src6v );                       \
+                                                             \
+    HPEL_FILTER_1( temp4v, temp5v, temp6v,                   \
+                   temp7v, temp8v, temp9v );                 \
+                                                             \
+    dest2v = vec_add( temp4v, sixteenv );                    \
+    dest2v = vec_sra( dest2v, fivev );                       \
+                                                             \
+    destv = vec_packsu( dest1v, dest2v );                    \
+                                                             \
+    VEC_STORE16( destv, &dstv[x+i_stride*y] );               \
+}
+
+#define HPEL_FILTER_CENTRAL()                     \
+{                                                 \
+    temp1v = vec_sld( tempav, tempbv, 12 );       \
+    temp2v = vec_sld( tempav, tempbv, 14 );       \
+    temp3v = tempbv;                              \
+    temp4v = vec_sld( tempbv, tempcv,  2 );       \
+    temp5v = vec_sld( tempbv, tempcv,  4 );       \
+    temp6v = vec_sld( tempbv, tempcv,  6 );       \
+                                                  \
+    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
+                   temp4v, temp5v, temp6v );      \
+                                                  \
+    dest1v = vec_add( temp1v, thirtytwov );       \
+    dest1v = vec_sra( dest1v, sixv );             \
+                                                  \
+    temp1v = vec_sld( tempbv, tempcv, 12 );       \
+    temp2v = vec_sld( tempbv, tempcv, 14 );       \
+    temp3v = tempcv;                              \
+    temp4v = vec_sld( tempcv, tempdv,  2 );       \
+    temp5v = vec_sld( tempcv, tempdv,  4 );       \
+    temp6v = vec_sld( tempcv, tempdv,  6 );       \
+                                                  \
+    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
+                   temp4v, temp5v, temp6v );      \
+                                                  \
+    dest2v = vec_add( temp1v, thirtytwov );       \
+    dest2v = vec_sra( dest2v, sixv );             \
+                                                  \
+    destv = vec_packsu( dest1v, dest2v );         \
+                                                  \
+    VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); \
+}
+
+void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+                               int i_stride, int i_width, int i_height )
+{
+    int x, y;
+
+    vec_u8_t destv;
+    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
+    vec_s16_t dest1v, dest2v;
+    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
+    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
+
+    PREP_LOAD;
+    PREP_STORE16;
+    LOAD_ZERO;
+
+    vec_u16_t twov, fourv, fivev, sixv;
+    vec_s16_t sixteenv, thirtytwov;
+    vect_ushort_u temp_u;
+
+    temp_u.s[0]=2;
+    twov = vec_splat( temp_u.v, 0 );
+    temp_u.s[0]=4;
+    fourv = vec_splat( temp_u.v, 0 );
+    temp_u.s[0]=5;
+    fivev = vec_splat( temp_u.v, 0 );
+    temp_u.s[0]=6;
+    sixv = vec_splat( temp_u.v, 0 );
+    temp_u.s[0]=16;
+    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
+    temp_u.s[0]=32;
+    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );
+
+    for( y = 0; y < i_height; y++ )
+    {
+        x = 0;
+
+        /* horizontal_filter */
+        HPEL_FILTER_HORIZONTAL();
+
+        /* vertical_filter */
+        HPEL_FILTER_VERTICAL();
+
+        /* central_filter */
+        tempav = tempcv;
+        tempbv = tempdv;
+        tempcv = vec_splat( temp1v, 0 ); /* first only */
+        tempdv = temp1v;
+        tempev = temp4v;
+
+        for( x = 16; x < i_width; x+=16 )
+        {
+            /* horizontal_filter */
+            HPEL_FILTER_HORIZONTAL();
+
+            /* vertical_filter */
+            HPEL_FILTER_VERTICAL();
+
+            /* central_filter */
+            tempav = tempcv;
+            tempbv = tempdv;
+            tempcv = tempev;
+            tempdv = temp1v;
+            tempev = temp4v;
+
+            HPEL_FILTER_CENTRAL();
+        }
+
+        /* central_filter */
+        tempav = tempcv;
+        tempbv = tempdv;
+        tempcv = tempev;
+        tempdv = vec_splat( tempcv, 7 ); /* last only */
+
+        HPEL_FILTER_CENTRAL();
+    }
+}
+
  void x264_mc_altivec_init( x264_mc_functions_t *pf )
  {
      pf->mc_luma   = mc_luma_altivec;
      pf->get_ref   = get_ref_altivec;
      pf->mc_chroma = mc_chroma_altivec;
+
+    pf->hpel_filter = x264_hpel_filter_altivec;
  }
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h

index 7768f8ae31f8de9be95c6600f020503470a04fb3..a4cb9e36a1f018e9b694912942ea04ab404fa5a5 100644 (file)
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -40,6 +40,16 @@
  #define vec_u32_t vector unsigned int
  #define vec_s32_t vector signed int
  
+typedef union {
+  unsigned int s[4];
+  vector unsigned int v;
+} vect_int_u;
+
+typedef union {
+  unsigned short s[8];
+  vector unsigned short v;
+} vect_ushort_u;
+
  /***********************************************************************
   * Null vector
   **********************************************************************/
diff --git a/common/ppc/quant.c b/common/ppc/quant.c

index 3036d8106abbe4112eff516e344dce383b0545a7..72608afdfa683b3165205d4fee6192487d86f411 100644 (file)
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -22,16 +22,6 @@
  #include <altivec.h>
  #endif
  
-typedef union {
-  unsigned int s[4];
-  vector unsigned int v;
-} vect_int_u;
-
-typedef union {
-  unsigned short s[8];
-  vector unsigned short v;
-} vect_ushort_u;
-
  #include "common/common.h"
  #include "ppccommon.h"
  #include "quant.h"
author	Guillaume Poirier <gpoirier@mplayerhq.hu>
	Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)
committer	Guillaume Poirier <gpoirier@mplayerhq.hu>
	Sun, 18 Nov 2007 23:47:41 +0000 (23:47 +0000)
common/ppc/mc.c		patch \| blob \| history
common/ppc/ppccommon.h		patch \| blob \| history
common/ppc/quant.c		patch \| blob \| history