From 4d5c7a033fbe7e7b168381a9fa15e8c2eb1a6a2f Mon Sep 17 00:00:00 2001
From: Eric Petit
Date: Fri, 13 May 2005 15:30:18 +0000
Subject: [PATCH] ppc/*: more tuning on satd (+5%)

git-svn-id: svn://svn.videolan.org/x264/trunk@229 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/ppc/pixel.c     | 664 +++++++++++++++++++++++++++++++++++++++++
 common/ppc/ppccommon.h |  64 ----
 2 files changed, 571 insertions(+), 157 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index b56747e3..bc69e5a1 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -36,7 +36,10 @@
 #include "pixel.h"
 #include "ppccommon.h"
 
-/* sad routines */
+/***********************************************************************
+ * SAD routines
+ **********************************************************************/
+
 #define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \
 static int name( uint8_t *pix1, int i_pix1, \
                  uint8_t *pix2, int i_pix2 ) \
@@ -60,7 +63,8 @@ static int name( uint8_t *pix1, int i_pix1, \
         pix2 += i_pix2; \
     } \
     sumv = vec_sum##a( sumv, zero_s32v ); \
-    vec_ste( vec_splat( sumv, b ), 0, &sum ); \
+    sumv = vec_splat( sumv, b ); \
+    vec_ste( sumv, 0, &sum ); \
     return sum; \
 }
 
@@ -69,52 +73,183 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec, 8, 16, 2s, 1 )
 PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec, 16, 8, s, 3 )
 PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 )
 
-/* satd routines */
+/***********************************************************************
+ * SATD routines
+ **********************************************************************/
+
+/***********************************************************************
+ * VEC_TRANSPOSE_8
+ ***********************************************************************
+ * Transposes an 8x8 matrix of s16 vectors
+ **********************************************************************/
+#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
+    b0 = vec_mergeh( a0, a4 ); \
+    b1 = vec_mergel( a0, a4 ); \
+    b2 = vec_mergeh( a1, a5 ); \
+    b3 = vec_mergel( a1, a5 ); \
+    b4 = vec_mergeh( a2, a6 ); \
+    b5 = vec_mergel( a2, a6 ); \
+    b6 = vec_mergeh( a3, a7 ); \
+    b7 = vec_mergel( a3, a7 ); \
+    a0 = vec_mergeh( b0, b4 ); \
+    a1 = vec_mergel( b0, b4 ); \
+    a2 = vec_mergeh( b1, b5 ); \
+    a3 = vec_mergel( b1, b5 ); \
+    a4 = vec_mergeh( b2, b6 ); \
+    a5 = vec_mergel( b2, b6 ); \
+    a6 = vec_mergeh( b3, b7 ); \
+    a7 = vec_mergel( b3, b7 ); \
+    b0 = vec_mergeh( a0, a4 ); \
+    b1 = vec_mergel( a0, a4 ); \
+    b2 = vec_mergeh( a1, a5 ); \
+    b3 = vec_mergel( a1, a5 ); \
+    b4 = vec_mergeh( a2, a6 ); \
+    b5 = vec_mergel( a2, a6 ); \
+    b6 = vec_mergeh( a3, a7 ); \
+    b7 = vec_mergel( a3, a7 )
+
+/***********************************************************************
+ * VEC_TRANSPOSE_4
+ ***********************************************************************
+ * Transposes a 4x4 matrix of s16 vectors
+ **********************************************************************/
+#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
+    b0 = vec_mergeh( a0, a0 ); \
+    b1 = vec_mergeh( a1, a0 ); \
+    b2 = vec_mergeh( a2, a0 ); \
+    b3 = vec_mergeh( a3, a0 ); \
+    a0 = vec_mergeh( b0, b2 ); \
+    a1 = vec_mergel( b0, b2 ); \
+    a2 = vec_mergeh( b1, b3 ); \
+    a3 = vec_mergel( b1, b3 ); \
+    b0 = vec_mergeh( a0, a2 ); \
+    b1 = vec_mergel( a0, a2 ); \
+    b2 = vec_mergeh( a1, a3 ); \
+    b3 = vec_mergel( a1, a3 )
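The 4x4 case is easier to verify in scalar form. Below is a plain-C model of the merge network (an illustrative sketch, not part of the patch; mergeh()/mergel() are hypothetical stand-ins for vec_mergeh()/vec_mergel() on eight s16 lanes). Tracing VEC_TRANSPOSE_4's three merge rounds this way shows that lanes 0-3 of the outputs end up holding the transposed 4x4 block, while lanes 4-7 carry don't-care values that later stages discard:

#include <stdint.h>
#include <stdio.h>

typedef struct { int16_t e[8]; } v8s16;

/* Interleave the high halves of x and y, like vec_mergeh on s16 */
static v8s16 mergeh( v8s16 x, v8s16 y )
{
    v8s16 r;
    for( int i = 0; i < 4; i++ ) { r.e[2*i] = x.e[i]; r.e[2*i+1] = y.e[i]; }
    return r;
}
/* Interleave the low halves of x and y, like vec_mergel on s16 */
static v8s16 mergel( v8s16 x, v8s16 y )
{
    v8s16 r;
    for( int i = 0; i < 4; i++ ) { r.e[2*i] = x.e[4+i]; r.e[2*i+1] = y.e[4+i]; }
    return r;
}

int main( void )
{
    v8s16 a0 = {{  0,  1,  2,  3 }}, a1 = {{ 10, 11, 12, 13 }};
    v8s16 a2 = {{ 20, 21, 22, 23 }}, a3 = {{ 30, 31, 32, 33 }};
    v8s16 b0, b1, b2, b3;

    /* Same steps as VEC_TRANSPOSE_4 */
    b0 = mergeh( a0, a0 ); b1 = mergeh( a1, a0 );
    b2 = mergeh( a2, a0 ); b3 = mergeh( a3, a0 );
    a0 = mergeh( b0, b2 ); a1 = mergel( b0, b2 );
    a2 = mergeh( b1, b3 ); a3 = mergel( b1, b3 );
    b0 = mergeh( a0, a2 ); b1 = mergel( a0, a2 );
    b2 = mergeh( a1, a3 ); b3 = mergel( a1, a3 );

    /* Prints 0 10 20 30 / 1 11 21 31 / 2 12 22 32 / 3 13 23 33:
     * lanes 0-3 of b0..b3 are the columns of the input rows a0..a3 */
    v8s16 *b[4] = { &b0, &b1, &b2, &b3 };
    for( int i = 0; i < 4; i++ )
        printf( "%d %d %d %d\n", b[i]->e[0], b[i]->e[1], b[i]->e[2], b[i]->e[3] );
    return 0;
}

The same interleaving argument, with a4..a7 as real rows instead of don't-care inputs, gives the full 8x8 transpose of VEC_TRANSPOSE_8.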
-#define DO_DIFF(a,b) \
-    VEC_LOAD( pix1, pix1v, b ); \
-    pix1v = vec_u8_to_s16( pix1v ); \
-    VEC_LOAD( pix2, pix2v, b ); \
-    pix2v = vec_u8_to_s16( pix2v ); \
-    diff##a##v = vec_sub( pix1v, pix2v ); \
-    pix1 += i_pix1; \
-    pix2 += i_pix2
+/***********************************************************************
+ * VEC_HADAMAR
+ ***********************************************************************
+ * b[0] = a[0] + a[1] + a[2] + a[3]
+ * b[1] = a[0] + a[1] - a[2] - a[3]
+ * b[2] = a[0] - a[1] - a[2] + a[3]
+ * b[3] = a[0] - a[1] + a[2] - a[3]
+ **********************************************************************/
+#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
+    b2 = vec_add( a0, a1 ); \
+    b3 = vec_add( a2, a3 ); \
+    a0 = vec_sub( a0, a1 ); \
+    a2 = vec_sub( a2, a3 ); \
+    b0 = vec_add( b2, b3 ); \
+    b1 = vec_sub( b2, b3 ); \
+    b2 = vec_sub( a0, a2 ); \
+    b3 = vec_add( a0, a2 )
+
+/***********************************************************************
+ * VEC_DIFF_H
+ ***********************************************************************
+ * p1, p2: u8 *
+ * i1, i2, n: int
+ * d: s16v
+ *
+ * Loads n bytes from p1 and p2, computes the difference of their high
+ * elements into d, and increments p1 and p2 by i1 and i2
+ **********************************************************************/
+#define PREP_DIFF \
+    LOAD_ZERO; \
+    PREP_LOAD; \
+    vec_s16_t pix1v, pix2v;
 
-#define ADD_ABS_TO_SATD(a,b) \
-    temp##a##v = vec_abs( temp##a##v ); \
-    satdv = vec_sum4s( temp##a##v, b )
+#define VEC_DIFF_H(p1,i1,p2,i2,n,d) \
+    VEC_LOAD( p1, pix1v, n ); \
+    pix1v = vec_u8_to_s16( pix1v ); \
+    VEC_LOAD( p2, pix2v, n ); \
+    pix2v = vec_u8_to_s16( pix2v ); \
+    d = vec_sub( pix1v, pix2v ); \
+    p1 += i1; \
+    p2 += i2
 
-static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
-                                          uint8_t *pix2, int i_pix2 )
+/***********************************************************************
+ * VEC_DIFF_HL
+ ***********************************************************************
+ * p1, p2: u8 *
+ * i1, i2: int
+ * dh, dl: s16v
+ *
+ * Loads 16 bytes from p1 and p2, computes the difference of the high
+ * elements into dh and of the low elements into dl, and increments p1
+ * and p2 by i1 and i2
+ **********************************************************************/
+#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \
+    VEC_LOAD( p1, pix1v, 16 ); \
+    temp0v = vec_u8_to_s16_h( pix1v ); \
+    temp1v = vec_u8_to_s16_l( pix1v ); \
+    VEC_LOAD( p2, pix2v, 16 ); \
+    temp2v = vec_u8_to_s16_h( pix2v ); \
+    temp3v = vec_u8_to_s16_l( pix2v ); \
+    dh = vec_sub( temp0v, temp2v ); \
+    dl = vec_sub( temp1v, temp3v ); \
+    p1 += i1; \
+    p2 += i2
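In scalar terms, one VEC_DIFF_H( p1, i1, p2, i2, n, d ) invocation widens n unsigned bytes from each source to signed 16 bit, subtracts them, and steps both pointers to the next row; VEC_DIFF_HL does the same for all 16 bytes of a load, splitting the result into a high and a low vector of eight differences. A rough C equivalent of the first macro (a sketch with a hypothetical helper name, not part of the patch):

#include <stdint.h>

/* What one VEC_DIFF_H invocation computes: d[0..n-1] receives the
 * widened differences, and both pointers advance by their stride */
static void diff_h( uint8_t **p1, int i1, uint8_t **p2, int i2,
                    int n, int16_t *d )
{
    for( int k = 0; k < n; k++ )
        d[k] = (int16_t)( (*p1)[k] - (*p2)[k] );
    *p1 += i1;
    *p2 += i2;
}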
+
+/***********************************************************************
+ * VEC_ABS
+ ***********************************************************************
+ * a: s16v
+ *
+ * a = abs(a)
+ *
+ * We call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
+ * internally also calls vec_splat(0), and we already have a zero
+ * vector.
+ **********************************************************************/
+#define VEC_ABS(a) \
+    pix1v = vec_sub( zero_s16v, a ); \
+    a = vec_max( a, pix1v );
+
+/***********************************************************************
+ * VEC_ADD_ABS
+ ***********************************************************************
+ * a: s16v
+ * b, c: s32v
+ *
+ * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
+ **********************************************************************/
+#define VEC_ADD_ABS(a,b,c) \
+    VEC_ABS( a ); \
+    c = vec_sum4s( a, b )
+
+/***********************************************************************
+ * SATD 4x4
+ **********************************************************************/
+static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
+                                   uint8_t *pix2, int i_pix2 )
 {
     DECLARE_ALIGNED( int, i_satd, 16 );
 
-    LOAD_ZERO;
-    PREP_LOAD;
-    vec_s16_t pix1v, pix2v;
+    PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v;
     vec_s16_t temp0v, temp1v, temp2v, temp3v;
     vec_s32_t satdv;
 
-    DO_DIFF( 0, 4 );
-    DO_DIFF( 1, 4 );
-    DO_DIFF( 2, 4 );
-    DO_DIFF( 3, 4 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
 
     /* Hadamar H */
-    vec_hadamar( diff0v, diff1v, diff2v, diff3v,
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
-    vec_transpose4x4( temp0v, temp1v, temp2v, temp3v,
-                      diff0v, diff1v, diff2v, diff3v );
+    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
+                     diff0v, diff1v, diff2v, diff3v );
 
     /* Hadamar V */
-    vec_hadamar( diff0v, diff1v, diff2v, diff3v,
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
 
-    ADD_ABS_TO_SATD( 0, zero_s32v );
-    ADD_ABS_TO_SATD( 1, satdv );
-    ADD_ABS_TO_SATD( 2, satdv );
-    ADD_ABS_TO_SATD( 3, satdv );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
 
     satdv = vec_sum2s( satdv, zero_s32v );
     satdv = vec_splat( satdv, 1 );
@@ -123,54 +258,152 @@ static inline int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
     return i_satd / 2;
 }
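For reference, the whole 4x4 SATD pipeline above reads naturally in plain C (a sketch under the same definitions, with a hypothetical function name; not part of the patch): Hadamard-transform the rows of the difference block, transpose, Hadamard-transform again, then halve the sum of absolute values, which matches the final "return i_satd / 2".

#include <stdint.h>
#include <stdlib.h>   /* abs() */

static int satd_4x4_c( uint8_t *pix1, int i_pix1,
                       uint8_t *pix2, int i_pix2 )
{
    int d[4][4], t[4][4], satd = 0;

    /* Difference block, as VEC_DIFF_H builds it */
    for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[x] - pix2[x];

    /* Horizontal 4-point Hadamard (VEC_HADAMAR), stored transposed
     * so the vertical pass can run on rows (VEC_TRANSPOSE_4) */
    for( int y = 0; y < 4; y++ )
    {
        int s01 = d[y][0] + d[y][1], s23 = d[y][2] + d[y][3];
        int d01 = d[y][0] - d[y][1], d23 = d[y][2] - d[y][3];
        t[0][y] = s01 + s23; t[1][y] = s01 - s23;
        t[2][y] = d01 - d23; t[3][y] = d01 + d23;
    }
    /* Vertical pass plus absolute accumulation (VEC_ADD_ABS) */
    for( int y = 0; y < 4; y++ )
    {
        int s01 = t[y][0] + t[y][1], s23 = t[y][2] + t[y][3];
        int d01 = t[y][0] - t[y][1], d23 = t[y][2] - t[y][3];
        satd += abs( s01 + s23 ) + abs( s01 - s23 )
              + abs( d01 - d23 ) + abs( d01 + d23 );
    }
    return satd / 2;
}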
 
-static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
-                                          uint8_t *pix2, int i_pix2 )
+/***********************************************************************
+ * SATD 4x8
+ **********************************************************************/
+static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
+                                   uint8_t *pix2, int i_pix2 )
 {
     DECLARE_ALIGNED( int, i_satd, 16 );
 
-    LOAD_ZERO;
-    PREP_LOAD;
+    PREP_DIFF;
+    vec_s16_t diff0v, diff1v, diff2v, diff3v;
+    vec_s16_t temp0v, temp1v, temp2v, temp3v;
     vec_s32_t satdv;
-    vec_s16_t pix1v, pix2v;
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
+                     diff0v, diff1v, diff2v, diff3v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
+                     diff0v, diff1v, diff2v, diff3v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+
+    satdv = vec_sum2s( satdv, zero_s32v );
+    satdv = vec_splat( satdv, 1 );
+    vec_ste( satdv, 0, &i_satd );
+
+    return i_satd / 2;
+}
+
+/***********************************************************************
+ * SATD 8x4
+ **********************************************************************/
+static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
+                                   uint8_t *pix2, int i_pix2 )
+{
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    PREP_DIFF;
     vec_s16_t diff0v, diff1v, diff2v, diff3v,
               diff4v, diff5v, diff6v, diff7v;
     vec_s16_t temp0v, temp1v, temp2v, temp3v,
               temp4v, temp5v, temp6v, temp7v;
+    vec_s32_t satdv;
 
-    DO_DIFF( 0, 8 );
-    DO_DIFF( 1, 8 );
-    DO_DIFF( 2, 8 );
-    DO_DIFF( 3, 8 );
-    DO_DIFF( 4, 8 );
-    DO_DIFF( 5, 8 );
-    DO_DIFF( 6, 8 );
-    DO_DIFF( 7, 8 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
 
-    /* Hadamar H */
-    vec_hadamar( diff0v, diff1v, diff2v, diff3v,
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
+    /* This causes warnings because temp4v...temp7v haven't been set,
+       but we don't care: their garbage lands in elements 2 and 3 of
+       satdv, which the final vec_sum2s()/vec_splat() discard */
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diff0v, diff1v, diff2v, diff3v,
+                     diff4v, diff5v, diff6v, diff7v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
-    vec_hadamar( diff4v, diff5v, diff6v, diff7v,
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                  temp4v, temp5v, temp6v, temp7v );
-    vec_transpose8x8( temp0v, temp1v, temp2v, temp3v,
-                      temp4v, temp5v, temp6v, temp7v,
-                      diff0v, diff1v, diff2v, diff3v,
-                      diff4v, diff5v, diff6v, diff7v );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
 
-    /* Hadamar V */
-    vec_hadamar( diff0v, diff1v, diff2v, diff3v,
+    satdv = vec_sum2s( satdv, zero_s32v );
+    satdv = vec_splat( satdv, 1 );
+    vec_ste( satdv, 0, &i_satd );
+
+    return i_satd / 2;
+}
+
+/***********************************************************************
+ * SATD 8x8
+ **********************************************************************/
+static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
+                                   uint8_t *pix2, int i_pix2 )
+{
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    PREP_DIFF;
+    vec_s16_t diff0v, diff1v, diff2v, diff3v,
+              diff4v, diff5v, diff6v, diff7v;
+    vec_s16_t temp0v, temp1v, temp2v, temp3v,
+              temp4v, temp5v, temp6v, temp7v;
+    vec_s32_t satdv;
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
+                 temp4v, temp5v, temp6v, temp7v );
+
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diff0v, diff1v, diff2v, diff3v,
+                     diff4v, diff5v, diff6v, diff7v );
+
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
-    vec_hadamar( diff4v, diff5v, diff6v, diff7v,
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                  temp4v, temp5v, temp6v, temp7v );
 
-    ADD_ABS_TO_SATD( 0, zero_s32v );
-    ADD_ABS_TO_SATD( 1, satdv );
-    ADD_ABS_TO_SATD( 2, satdv );
-    ADD_ABS_TO_SATD( 3, satdv );
-    ADD_ABS_TO_SATD( 4, satdv );
-    ADD_ABS_TO_SATD( 5, satdv );
-    ADD_ABS_TO_SATD( 6, satdv );
-    ADD_ABS_TO_SATD( 7, satdv );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
 
     satdv = vec_sums( satdv, zero_s32v );
     satdv = vec_splat( satdv, 3 );
@@ -179,46 +412,291 @@ static inline int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
     return i_satd / 2;
 }
 
-static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
-                                   uint8_t *pix2, int i_pix2 )
+/***********************************************************************
+ * SATD 8x16
+ **********************************************************************/
+static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
+                                    uint8_t *pix2, int i_pix2 )
 {
-    return pixel_satd_4x4_altivec( &pix1[0], i_pix1,
-                                   &pix2[0], i_pix2 ) +
-           pixel_satd_4x4_altivec( &pix1[4], i_pix1,
-                                   &pix2[4], i_pix2 );
-}
+    DECLARE_ALIGNED( int, i_satd, 16 );
 
-static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
-                                   uint8_t *pix2, int i_pix2 )
-{
-    return pixel_satd_4x4_altivec( &pix1[0], i_pix1,
-                                   &pix2[0], i_pix2 ) +
-           pixel_satd_4x4_altivec( &pix1[4*i_pix1], i_pix1,
-                                   &pix2[4*i_pix2], i_pix2 );
-}
-static inline int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
-                                           uint8_t *pix2, int i_pix2 )
-{
-    return pixel_satd_8x8_altivec( &pix1[0], i_pix1,
-                                   &pix2[0], i_pix2 ) +
-           pixel_satd_8x8_altivec( &pix1[8], i_pix1,
-                                   &pix2[8], i_pix2 );
+    PREP_DIFF;
+    vec_s16_t diff0v, diff1v, diff2v, diff3v,
+              diff4v, diff5v, diff6v, diff7v;
+    vec_s16_t temp0v, temp1v, temp2v, temp3v,
+              temp4v, temp5v, temp6v, temp7v;
+    vec_s32_t satdv;
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diff0v, diff1v, diff2v, diff3v,
+                     diff4v, diff5v, diff6v, diff7v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diff0v, diff1v, diff2v, diff3v,
+                     diff4v, diff5v, diff6v, diff7v );
+    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    satdv = vec_sums( satdv, zero_s32v );
+    satdv = vec_splat( satdv, 3 );
+    vec_ste( satdv, 0, &i_satd );
+
+    return i_satd / 2;
+}
 
-static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
+
+/***********************************************************************
+ * SATD 16x8
+ **********************************************************************/
+static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
                                     uint8_t *pix2, int i_pix2 )
 {
-    return pixel_satd_8x8_altivec( &pix1[0], i_pix1,
-                                   &pix2[0], i_pix2 ) +
-           pixel_satd_8x8_altivec( &pix1[8*i_pix1], i_pix1,
-                                   &pix2[8*i_pix2], i_pix2 );
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    PREP_LOAD;
+    vec_s32_t satdv;
+    vec_s16_t pix1v, pix2v;
+    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
+              diffh4v, diffh5v, diffh6v, diffh7v;
+    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
+              diffl4v, diffl5v, diffl6v, diffl7v;
+    vec_s16_t temp0v, temp1v, temp2v, temp3v,
+              temp4v, temp5v, temp6v, temp7v;
+
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
+
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffh0v, diffh1v, diffh2v, diffh3v,
+                     diffh4v, diffh5v, diffh6v, diffh7v );
+
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffl0v, diffl1v, diffl2v, diffl3v,
+                     diffl4v, diffl5v, diffl6v, diffl7v );
+
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    satdv = vec_sums( satdv, zero_s32v );
+    satdv = vec_splat( satdv, 3 );
+    vec_ste( satdv, 0, &i_satd );
+
+    return i_satd / 2;
+}
 
+
+/***********************************************************************
+ * SATD 16x16
+ **********************************************************************/
 static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                                      uint8_t *pix2, int i_pix2 )
 {
-    return pixel_satd_16x8_altivec( &pix1[0], i_pix1,
-                                    &pix2[0], i_pix2 ) +
-           pixel_satd_16x8_altivec( &pix1[8*i_pix1], i_pix1,
-                                    &pix2[8*i_pix2], i_pix2 );
+    DECLARE_ALIGNED( int, i_satd, 16 );
+
+    LOAD_ZERO;
+    PREP_LOAD;
+    vec_s32_t satdv;
+    vec_s16_t pix1v, pix2v;
+    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
+              diffh4v, diffh5v, diffh6v, diffh7v;
+    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
+              diffl4v, diffl5v, diffl6v, diffl7v;
+    vec_s16_t temp0v, temp1v, temp2v, temp3v,
+              temp4v, temp5v, temp6v, temp7v;
+
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffh0v, diffh1v, diffh2v, diffh3v,
+                     diffh4v, diffh5v, diffh6v, diffh7v );
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffl0v, diffl1v, diffl2v, diffl3v,
+                     diffl4v, diffl5v, diffl6v, diffl7v );
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
+    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffh0v, diffh1v, diffh2v, diffh3v,
+                     diffh4v, diffh5v, diffh6v, diffh7v );
+    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
+                     temp4v, temp5v, temp6v, temp7v,
+                     diffl0v, diffl1v, diffl2v, diffl3v,
+                     diffl4v, diffl5v, diffl6v, diffl7v );
+    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
+                 temp0v, temp1v, temp2v, temp3v );
+    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
+                 temp4v, temp5v, temp6v, temp7v );
+    VEC_ADD_ABS( temp0v, satdv, satdv );
+    VEC_ADD_ABS( temp1v, satdv, satdv );
+    VEC_ADD_ABS( temp2v, satdv, satdv );
+    VEC_ADD_ABS( temp3v, satdv, satdv );
+    VEC_ADD_ABS( temp4v, satdv, satdv );
+    VEC_ADD_ABS( temp5v, satdv, satdv );
+    VEC_ADD_ABS( temp6v, satdv, satdv );
+    VEC_ADD_ABS( temp7v, satdv, satdv );
+
+    satdv = vec_sums( satdv, zero_s32v );
+    satdv = vec_splat( satdv, 3 );
+    vec_ste( satdv, 0, &i_satd );
+
+    return i_satd / 2;
 }
 
 /****************************************************************************
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index ff05942b..b298265e 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -134,67 +134,3 @@
     _hv = vec_sel( _hv, v, _tmp2v ); \
     vec_st( _hv, 0, p )
 
-/***********************************************************************
- * vec_transpose8x8
- **********************************************************************/
-#define vec_transpose8x8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
-    b0 = vec_mergeh( a0, a4 ); \
-    b1 = vec_mergel( a0, a4 ); \
-    b2 = vec_mergeh( a1, a5 ); \
-    b3 = vec_mergel( a1, a5 ); \
-    b4 = vec_mergeh( a2, a6 ); \
-    b5 = vec_mergel( a2, a6 ); \
-    b6 = vec_mergeh( a3, a7 ); \
-    b7 = vec_mergel( a3, a7 ); \
-    a0 = vec_mergeh( b0, b4 ); \
-    a1 = vec_mergel( b0, b4 ); \
-    a2 = vec_mergeh( b1, b5 ); \
-    a3 = vec_mergel( b1, b5 ); \
-    a4 = vec_mergeh( b2, b6 ); \
-    a5 = vec_mergel( b2, b6 ); \
-    a6 = vec_mergeh( b3, b7 ); \
-    a7 = vec_mergel( b3, b7 ); \
-    b0 = vec_mergeh( a0, a4 ); \
-    b1 = vec_mergel( a0, a4 ); \
-    b2 = vec_mergeh( a1, a5 ); \
-    b3 = vec_mergel( a1, a5 ); \
-    b4 = vec_mergeh( a2, a6 ); \
-    b5 = vec_mergel( a2, a6 ); \
-    b6 = vec_mergeh( a3, a7 ); \
-    b7 = vec_mergel( a3, a7 )
-
-/***********************************************************************
- * vec_transpose4x4
- **********************************************************************/
-#define vec_transpose4x4(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b0 = vec_mergeh( a0, a0 ); \
-    b1 = vec_mergeh( a1, a0 ); \
-    b2 = vec_mergeh( a2, a0 ); \
-    b3 = vec_mergeh( a3, a0 ); \
-    a0 = vec_mergeh( b0, b2 ); \
-    a1 = vec_mergel( b0, b2 ); \
-    a2 = vec_mergeh( b1, b3 ); \
-    a3 = vec_mergel( b1, b3 ); \
-    b0 = vec_mergeh( a0, a2 ); \
-    b1 = vec_mergel( a0, a2 ); \
-    b2 = vec_mergeh( a1, a3 ); \
-    b3 = vec_mergel( a1, a3 )
-
-/***********************************************************************
- * vec_hadamar
- ***********************************************************************
- * b[0] = a[0] + a[1] + a[2] + a[3]
- * b[1] = a[0] + a[1] - a[2] - a[3]
- * b[2] = a[0] - a[1] - a[2] + a[3]
- * b[3] = a[0] - a[1] + a[2] - a[3]
- **********************************************************************/
-#define vec_hadamar(a0,a1,a2,a3,b0,b1,b2,b3) \
-    b2 = vec_add( a0, a1 ); \
-    b3 = vec_add( a2, a3 ); \
-    a0 = vec_sub( a0, a1 ); \
-    a2 = vec_sub( a2, a3 ); \
-    b0 = vec_add( b2, b3 ); \
-    b1 = vec_sub( b2, b3 ); \
-    b2 = vec_sub( a0, a2 ); \
-    b3 = vec_add( a0, a2 )
-
-- 
2.40.0
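Standalone, the butterfly shared by VEC_HADAMAR and the removed vec_hadamar can be checked against the four sums documented above with plain ints (an illustrative test, not part of x264):

#include <assert.h>

static void hadamar4( int a0, int a1, int a2, int a3, int b[4] )
{
    int s01 = a0 + a1, s23 = a2 + a3;   /* the macro's b2, b3 */
    int d01 = a0 - a1, d23 = a2 - a3;   /* the macro's a0, a2 */
    b[0] = s01 + s23;   /* a[0] + a[1] + a[2] + a[3] */
    b[1] = s01 - s23;   /* a[0] + a[1] - a[2] - a[3] */
    b[2] = d01 - d23;   /* a[0] - a[1] - a[2] + a[3] */
    b[3] = d01 + d23;   /* a[0] - a[1] + a[2] - a[3] */
}

int main( void )
{
    int b[4];
    hadamar4( 1, 2, 3, 4, b );
    assert( b[0] == 10 && b[1] == -4 && b[2] == 0 && b[3] == -2 );
    return 0;
}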