int i_height )
{
int y;
- vector_u8_t src1v, src2v;
+ vec_u8_t src1v, src2v;
for( y = 0; y < i_height; y++ )
{
LOAD_16( src1, src1v );
MC_COPY( mc_copy_w16, 16 )
/* TAP_FILTER:
- a is source (vector_s16_t [6])
- b is a temporary vector_s16_t
+ a is source (vec_s16_t [6])
+ b is a temporary vec_s16_t
c is the result
-   c = src[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
+   c = a[0] + a[5] - 5 * ( a[1] + a[4] ) + 20 * ( a[2] + a[3] );
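+   (i.e. the H.264 half-pel 6-tap filter: taps 1, -5, 20, 20, -5, 1)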
DECLARE_ALIGNED( int16_t, tmp[8], 16 );
LOAD_ZERO;
- vector_u8_t loadv;
- vector_s16_t srcv[6];
- vector_u8_t * _srcv = (vector_u8_t*) srcv;
- vector_s16_t dstv;
- vector_s16_t tmpv;
+ vec_u8_t loadv;
+ vec_s16_t srcv[6];
+ vec_u8_t * _srcv = (vec_u8_t*) srcv;
+ vec_s16_t dstv;
+ vec_s16_t tmpv;
for( y = 0; y < i_height; y++ )
{
for( x = 0; x < 6; x++ )
{
- _srcv[x] = vec_perm( loadv, zero_u8,
+ _srcv[x] = vec_perm( loadv, zero_u8v,
vec_lvsl( 0, (int*) x ) );
- CONVERT_U8_TO_S16( srcv[x] );
+ CONVERT_U8_TO_S16( srcv[x], srcv[x] );
}
TAP_FILTER( srcv, tmpv, dstv );
DECLARE_ALIGNED( int16_t, tmp[8], 16 );
LOAD_ZERO;
- vector_s16_t srcv[6];
- vector_u8_t * _srcv = (vector_u8_t*) srcv;
- vector_s16_t dstv;
- vector_s16_t tmpv;
+ vec_s16_t srcv[6];
+ vec_u8_t * _srcv = (vec_u8_t*) srcv;
+ vec_s16_t dstv;
+ vec_s16_t tmpv;
for( y = 0; y < i_height; y++ )
{
srcv[x] = srcv[x+1];
}
LOAD_8( &src[3*i_src], _srcv[5] );
- CONVERT_U8_TO_S16( srcv[5] );
+ CONVERT_U8_TO_S16( srcv[5], srcv[5] );
}
else
{
for( x = 0; x < 6; x++ )
{
LOAD_8( &src[(x-2)*i_src], _srcv[x] );
- CONVERT_U8_TO_S16( srcv[x] );
+ CONVERT_U8_TO_S16( srcv[x], srcv[x] );
}
}
}
}
+static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
+{
+ uint8_t *srcp;
+ int x, y;
+ int d8x = mvx & 0x07;
+ int d8y = mvy & 0x07;
+
+ DECLARE_ALIGNED( uint16_t, coeff[4], 16 );
+ coeff[0] = (8-d8x)*(8-d8y);
+ coeff[1] = d8x *(8-d8y);
+ coeff[2] = (8-d8x)*d8y;
+ coeff[3] = d8x *d8y;
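+    /* The four bilinear weights sum to 64, so each output pixel is
+       ( c0*A + c1*B + c2*C + c3*D + 32 ) >> 6 with A..D the four
+       neighbouring source pixels, as in the scalar loop below. */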
+
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ srcp = &src[i_src_stride];
+
+ if( i_width < 8 )
+ {
+ /* TODO: optimize */
+ for( y = 0; y < i_height; y++ )
+ {
+ for( x = 0; x < i_width; x++ )
+ {
+ dst[x] = ( coeff[0]*src[x] + coeff[1]*src[x+1] +
+ coeff[2]*srcp[x] + coeff[3]*srcp[x+1] + 32 ) >> 6;
+ }
+ dst += i_dst_stride;
+
+ src = srcp;
+ srcp += i_src_stride;
+ }
+ return;
+ }
+
+ /* We now assume that i_width == 8 */
+ LOAD_ZERO;
+ vec_u16_t coeffv[4];
+ vec_u16_t k32v;
+ vec_u8_t srcv_8[4];
+ vec_u16_t srcv_16[4];
+ vec_u8_t dstv_8;
+ vec_u16_t dstv_16;
+ vec_u8_t permv;
+ vec_u16_t shiftv;
+
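+    /* Splat each 16-bit weight into its own vector; k32v is the
+       rounding constant 32, shiftv the final >> 6, and permv rotates
+       a vector left by one byte so that lane x holds src[x+1]. */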
+ coeffv[0] = vec_ld( 0, coeff );
+ coeffv[3] = vec_splat( coeffv[0], 3 );
+ coeffv[2] = vec_splat( coeffv[0], 2 );
+ coeffv[1] = vec_splat( coeffv[0], 1 );
+ coeffv[0] = vec_splat( coeffv[0], 0 );
+ k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
+ permv = vec_lvsl( 0, (uint8_t *) 1 );
+ shiftv = vec_splat_u16( 6 );
+
+ LOAD_16( src, srcv_8[2] );
+ srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
+
+ for( y = 0; y < i_height; y++ )
+ {
+ int i;
+
+ srcv_8[0] = srcv_8[2];
+ srcv_8[1] = srcv_8[3];
+ LOAD_16( srcp, srcv_8[2] );
+ srcv_8[3] = vec_perm( srcv_8[2], srcv_8[2], permv );
+
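+        /* Accumulate coeff[i] * src_i on top of the rounding constant,
+           then shift right by 6 and pack back to 8 bits. */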
+ dstv_16 = k32v;
+ for( i = 0; i < 4; i++ )
+ {
+ CONVERT_U8_TO_U16( srcv_8[i], srcv_16[i] );
+ srcv_16[i] = vec_mladd( coeffv[i], srcv_16[i], zero_u16v );
+ dstv_16 = vec_add( dstv_16, srcv_16[i] );
+ }
+ dstv_16 = vec_sr( dstv_16, shiftv );
+ CONVERT_U16_TO_U8( dstv_16, dstv_8 );
+ STORE_8( dstv_8, dst );
+
+ dst += i_dst_stride;
+ srcp += i_src_stride;
+ }
+}
+
void x264_mc_altivec_init( x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma_altivec;
pf->get_ref = get_ref_altivec;
+ pf->mc_chroma = mc_chroma_altivec;
}
int y; \
DECLARE_ALIGNED( int, sum, 16 ); \
\
- LOAD_ZERO; \
- vector_u8_t pix1v, pix2v; \
- vector_s32_t sumv = zero_s32; \
+ LOAD_ZERO; \
+ vec_u8_t pix1v, pix2v; \
+ vec_s32_t sumv = zero_s32v; \
for( y = 0; y < ly; y++ ) \
{ \
LOAD_##lx( pix1, pix1v ); \
LOAD_##lx( pix2, pix2v ); \
- sumv = (vector_s32_t) vec_sum4s( \
+ sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
- (vector_u32_t) sumv ); \
+ (vec_u32_t) sumv ); \
pix1 += i_pix1; \
pix2 += i_pix2; \
} \
- sumv = vec_sum##a( sumv, zero_s32 ); \
+ sumv = vec_sum##a( sumv, zero_s32v ); \
vec_ste( vec_splat( sumv, b ), 0, &sum ); \
return sum; \
}
DECLARE_ALIGNED( int, i_satd, 16 );
LOAD_ZERO;
- vector_s32_t satdv = zero_s32;
- vector_u8_t pix1u8v, pix2u8v;
- vector_s16_t pix1s16v, pix2s16v;
- vector_s16_t diffv[8];
- vector_s16_t tmpv[8];
- vector_s16_t s01v, s23v, d01v, d23v;
+ vec_s32_t satdv = zero_s32v;
+ vec_u8_t pix1u8v, pix2u8v;
+ vec_s16_t pix1s16v, pix2s16v;
+ vec_s16_t diffv[8];
+ vec_s16_t tmpv[8];
+ vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 8x8 */
for( i = 0; i < 8; i++ )
LOAD_8( pix2, pix2u8v );
/* u8 -> s16 conversion */
- pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
- pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+ CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
+ CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
{
satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
}
- satdv = vec_sums( satdv, zero_s32 );
+ satdv = vec_sums( satdv, zero_s32v );
/* Done */
vec_ste( vec_splat( satdv, 3 ), 0, &i_satd );
DECLARE_ALIGNED( int, i_satd, 16 );
LOAD_ZERO;
- vector_s32_t satdv = zero_s32;
- vector_u8_t pix1u8v, pix2u8v;
- vector_s16_t pix1s16v, pix2s16v;
- vector_s16_t diffv[4];
- vector_s16_t tmpv[4];
- vector_s16_t s01v, s23v, d01v, d23v;
+ vec_s32_t satdv = zero_s32v;
+ vec_u8_t pix1u8v, pix2u8v;
+ vec_s16_t pix1s16v, pix2s16v;
+ vec_s16_t diffv[4];
+ vec_s16_t tmpv[4];
+ vec_s16_t s01v, s23v, d01v, d23v;
/* Diff 4x8 */
for( i = 0; i < 4; i++ )
LOAD_4( pix1, pix1u8v );
LOAD_4( pix2, pix2u8v );
- /* u8 -> s16 conversion */
- pix1s16v = (vector_s16_t) vec_mergeh( zero_u8, pix1u8v );
- pix2s16v = (vector_s16_t) vec_mergeh( zero_u8, pix2u8v );
+ CONVERT_U8_TO_S16( pix1u8v, pix1s16v );
+ CONVERT_U8_TO_S16( pix2u8v, pix2s16v );
diffv[i] = vec_sub( pix1s16v, pix2s16v );
{
satdv = vec_sum4s( vec_abs( tmpv[i] ), satdv );
}
- satdv = vec_sum2s( satdv, zero_s32 );
+ satdv = vec_sum2s( satdv, zero_s32v );
/* Done */
vec_ste( vec_splat( satdv, 1 ), 0, &i_satd );
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
-/* Handy */
-#define vector_u8_t vector unsigned char
-#define vector_s16_t vector signed short
-#define vector_u32_t vector unsigned int
-#define vector_s32_t vector signed int
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t vector unsigned char
+#define vec_s8_t vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
-#define LOAD_ZERO vector_s32_t zero = vec_splat_s32( 0 )
-#define zero_u8 (vector_u8_t) zero
-#define zero_s16 (vector_s16_t) zero
-#define zero_s32 (vector_s32_t) zero
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO vec_u8_t zerov = vec_splat_u8( 0 )
-#define CONVERT_U8_TO_S16( a ) \
- a = (vector_s16_t) vec_mergeh( zero_u8, (vector_u8_t) a )
+#define zero_u8v (vec_u8_t) zerov
+#define zero_s8v (vec_s8_t) zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
-/* Macros to load aligned or unaligned data without risking buffer
- overflows. */
+/***********************************************************************
+ * CONVERT_*
+ **********************************************************************/
+#define CONVERT_U8_TO_U16( s, d ) \
+ d = (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
+#define CONVERT_U8_TO_S16( s, d ) \
+ d = (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) s )
+#define CONVERT_U16_TO_U8( s, d ) \
+ d = (vec_u8_t) vec_pack( (vec_u16_t) s, zero_u16v )
+#define CONVERT_S16_TO_U8( s, d ) \
+ d = (vec_u8_t) vec_pack( (vec_s16_t) s, zero_s16v )
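+/* The U8->U16/S16 conversions widen the first 8 bytes of s (vec_mergeh
+   with a zero vector), matching LOAD_8 which fills the first half of its
+   destination; the U16/S16->U8 packs keep the first 8 results. */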
+
+/***********************************************************************
+ * LOAD_16
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 16 bytes from p into v
+ **********************************************************************/
#define LOAD_16( p, v )                                    \
    if( (long) p & 0xF )                                   \
    {                                                      \
        v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ),     \
                      vec_lvsl( 0, p ) );                  \
    }                                                      \
    else                                                   \
    {                                                      \
        v = vec_ld( 0, p );                                \
    }
-#define LOAD_8( p, v ) \
- if( !( (long) p & 0xF ) ) \
- { \
- v = vec_ld( 0, p ); \
- } \
- else if( ( (long) p & 0xF ) < 9 ) \
- { \
- v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
- vec_lvsl( 0, p ) ); \
- } \
- else \
- { \
- v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
- vec_lvsl( 0, p ) ); \
+/***********************************************************************
+ * LOAD_8
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 8 bytes from p into the first half of v
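+ * (an offset of 8 or less within a 16-byte block means the 8 bytes
+ *  fit in a single aligned load, hence the < 9 test below)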
+ **********************************************************************/
+#define LOAD_8( p, v ) \
+ if( !( (long) p & 0xF ) ) \
+ { \
+ v = vec_ld( 0, p ); \
+ } \
+ else if( ( (long) p & 0xF ) < 9 ) \
+ { \
+ v = vec_perm( vec_ld( 0, p ), zero_u8v, \
+ vec_lvsl( 0, p ) ); \
+ } \
+ else \
+ { \
+ v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+ vec_lvsl( 0, p ) ); \
+ }
+
+/***********************************************************************
+ * LOAD_4
+ ***********************************************************************
+ * p: uint8_t *
+ * v: vec_u8_t
+ * Loads 4 bytes from p into the first quarter of v
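+ * (likewise, an offset below 13 keeps the 4 bytes within a single
+ *  aligned load)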
+ **********************************************************************/
+#define LOAD_4( p, v ) \
+ if( !( (long) p & 0xF ) ) \
+ { \
+ v = vec_ld( 0, p ); \
+ } \
+ else if( ( (long) p & 0xF ) < 13 ) \
+ { \
+ v = vec_perm( vec_ld( 0, p ), zero_u8v, \
+ vec_lvsl( 0, p ) ); \
+ } \
+ else \
+ { \
+ v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
+ vec_lvsl( 0, p ) ); \
}
-#define LOAD_4( p, v ) \
- if( !( (long) p & 0xF ) ) \
- { \
- v = vec_ld( 0, p ); \
- } \
- else if( ( (long) p & 0xF ) < 13 ) \
- { \
- v = vec_perm( vec_ld( 0, p ), (vector unsigned char) zero, \
- vec_lvsl( 0, p ) ); \
- } \
- else \
- { \
- v = vec_perm( vec_ld( 0, p ), vec_ld( 16, p ), \
- vec_lvsl( 0, p ) ); \
+/***********************************************************************
+ * STORE_16
+ ***********************************************************************
+ * v: vec_u8_t
+ * p: uint8_t *
+ * Stores the 16 bytes from v at address p
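+ * (the misaligned case is a read-modify-write of the two 16-byte
+ *  blocks that the store straddles)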
+ **********************************************************************/
+#define STORE_16( v, p ) \
+ if( (long) p & 0xF ) \
+ { \
+ vec_u8_t hv, lv, tmp1, tmp2; \
+ hv = vec_ld( 0, p ); \
+ lv = vec_ld( 16, p ); \
+ tmp2 = vec_lvsl( 0, p ); \
+ tmp1 = vec_perm( lv, hv, tmp2 ); \
+ tmp2 = vec_lvsr( 0, p ); \
+ hv = vec_perm( tmp1, v, tmp2 ); \
+ lv = vec_perm( v, tmp1, tmp2 ); \
+ vec_st( lv, 16, p ); \
+ vec_st( hv, 0, p ); \
+ } \
+ else \
+ { \
+ vec_st( v, 0, p ); \
}
-/* Store aligned or unaligned data */
-#define STORE_16( v, p ) \
- if( (long) p & 0xF ) \
- { \
- vector unsigned char tmp1, tmp2; \
- vector unsigned char align, mask; \
- tmp1 = vec_ld( 0, p ); \
- tmp2 = vec_ld( 16, p ); \
- align = vec_lvsr( 0, p ); \
- mask = vec_perm( (vector unsigned char) {0}, \
- (vector unsigned char) {1}, \
- align); \
- v = vec_perm( v, v, align); \
- tmp1 = vec_sel( tmp1, v, mask ); \
- tmp2 = vec_sel( v, tmp2, mask ); \
- vec_st( tmp1, 0, p ); \
- vec_st( tmp2, 16, p ); \
- } \
- else \
- { \
- vec_st( v, 0, p ); \
+/* FIXME We can do better than that */
+#define STORE_8( v, p ) \
+ { \
+ DECLARE_ALIGNED( uint8_t, _p[16], 16 ); \
+ vec_st( v, 0, _p ); \
+ memcpy( p, _p, 8 ); \
}
-/* Transpose 8x8 (vector_s16_t [8]) */
+/* Transpose 8x8 (vec_s16_t [8]) */
#define TRANSPOSE8x8( a, b ) \
b[0] = vec_mergeh( a[0], a[4] ); \
b[1] = vec_mergel( a[0], a[4] ); \
b[6] = vec_mergeh( a[3], a[7] ); \
b[7] = vec_mergel( a[3], a[7] );
-/* Transpose 4x4 (vector_s16_t [4]) */
+/* Transpose 4x4 (vec_s16_t [4]) */
#define TRANSPOSE4x4( a, b ) \
- (b)[0] = vec_mergeh( (a)[0], zero_s16 ); \
- (b)[1] = vec_mergeh( (a)[1], zero_s16 ); \
- (b)[2] = vec_mergeh( (a)[2], zero_s16 ); \
- (b)[3] = vec_mergeh( (a)[3], zero_s16 ); \
+ (b)[0] = vec_mergeh( (a)[0], zero_s16v ); \
+ (b)[1] = vec_mergeh( (a)[1], zero_s16v ); \
+ (b)[2] = vec_mergeh( (a)[2], zero_s16v ); \
+ (b)[3] = vec_mergeh( (a)[3], zero_s16v ); \
(a)[0] = vec_mergeh( (b)[0], (b)[2] ); \
(a)[1] = vec_mergel( (b)[0], (b)[2] ); \
(a)[2] = vec_mergeh( (b)[1], (b)[3] ); \
(b)[2] = vec_mergeh( (a)[1], (a)[3] ); \
(b)[3] = vec_mergel( (a)[1], (a)[3] );
-/* Hadamar (vector_s16_t [4]) */
+/* Hadamar (vec_s16_t [4]) */
#define HADAMAR( a, b ) \
s01v = vec_add( (a)[0], (a)[1] ); \
s23v = vec_add( (a)[2], (a)[3] ); \
;;
Darwin)
SYS="MACOSX"
+ CFLAGS="$CFLAGS -falign-loops=16"
LDFLAGS="$LDFLAGS -lm -lmx"
;;
FreeBSD)
#endif
#ifdef ARCH_PPC
#include "common/ppc/pixel.h"
+#include "common/ppc/mc.h"
#endif
/* buf1, buf2: initialised to random data; the tests must not write into them */
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma ) \
{ \
- memset(buf1, 0xCD, 1024); \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
mc_c.mc_luma( src2, 32, dst1, 16, dx, dy, w, h ); \
mc_a.mc_luma( src2, 32, dst2, 16, dx, dy, w, h ); \
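+        /* compare the whole buffers so that writes outside the     \
+           destination block are caught as well */                  \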
- if( memcmp( dst1, dst2, 16*16 ) ) \
+ if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok[0] = 0; \
#define MC_TEST_CHROMA( w, h ) \
if( mc_a.mc_chroma ) \
{ \
- memset(dst1, 0xCD, (h) * 16); \
+ memset(buf3, 0xCD, 1024); \
+ memset(buf4, 0xCD, 1024); \
mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \
- memset(dst2, 0xCD, (h) * 16); \
mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \
- if( memcmp( dst1, dst2, 16*16 ) ) \
+ if( memcmp( buf3, buf4, 1024 ) ) \
{ \
fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok[1] = 0; \