uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_LOAD;
PREP_STORE8;
- PREP_LOAD_SRC( src1 );
- PREP_LOAD_SRC( src2 );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
- VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
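+ /* unaligned loads: vec_vsx_ld does not require 16-byte aligned addresses */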
+ src1v = vec_vsx_ld( 0, src1 );
+ src2v = vec_vsx_ld( 0, src2 );
src1v = vec_avg( src1v, src2v );
VEC_STORE8( src1v, dst );
dst += i_dst;
src1 += i_src1;
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_LOAD;
- PREP_LOAD_SRC( src1 );
- PREP_LOAD_SRC( src2 );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
- VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
+ src1v = vec_vsx_ld( 0, src1 );
+ src2v = vec_vsx_ld( 0, src2 );
src1v = vec_avg( src1v, src2v );
vec_st(src1v, 0, dst);
uint8_t *src, intptr_t i_src, int i_height )
{
vec_u8_t cpyV;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
for( int y = 0; y < i_height; y++ )
{
- VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
+ cpyV = vec_vsx_ld( 0, src );
vec_st(cpyV, 0, dst);
src += i_src;
srcp = &src[i_src_stride];
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src2v_8, dstuv, dstvv;
vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
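+ /* one 16-byte unaligned load covers the 9 source bytes used per row */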
+ src2v_8 = vec_vsx_ld( 0, src );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
{
src0v_16 = src2v_16;
src1v_16 = src3v_16;
- VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
src0v_16 = src2v_16;
src1v_16 = src3v_16;
- VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
src2v_16 = vec_u8_to_u16( src2v_8 );
src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
srcp = &src[i_src_stride];
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
#endif
- VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
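+ /* two 16-byte loads provide the 18 source bytes needed; VSLD forms the +2 shifted copy */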
+ src2v_8 = vec_vsx_ld( 0, src );
+ src3v_8 = vec_vsx_ld( 16, src );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
for( int y = 0; y < i_height; y += 2 )
{
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
+ src3v_8 = vec_vsx_ld( 16, srcp );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
- VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
+ src2v_8 = vec_vsx_ld( 0, srcp );
+ src3v_8 = vec_vsx_ld( 16, srcp );
src3v_8 = VSLD( src2v_8, src3v_8, 2 );
#define HPEL_FILTER_HORIZONTAL() \
{ \
- VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
- VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
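+ /* 6-tap horizontal filter: load src[x-2 .. x+13] and src[x+14 .. x+29] */ \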
+ src1v = vec_vsx_ld( x- 2+i_stride*y, src ); \
+ src6v = vec_vsx_ld( x+14+i_stride*y, src ); \
\
src2v = VSLD( src1v, src6v, 1 ); \
src3v = VSLD( src1v, src6v, 2 ); \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x+i_stride*y, dsth ); \
}
#define HPEL_FILTER_VERTICAL() \
{ \
- VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
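+ /* 6-tap vertical filter over rows y-2 .. y+3 */ \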
+ src1v = vec_vsx_ld( x+i_stride*(y-2), src ); \
+ src2v = vec_vsx_ld( x+i_stride*(y-1), src ); \
+ src3v = vec_vsx_ld( x+i_stride*(y-0), src ); \
+ src4v = vec_vsx_ld( x+i_stride*(y+1), src ); \
+ src5v = vec_vsx_ld( x+i_stride*(y+2), src ); \
+ src6v = vec_vsx_ld( x+i_stride*(y+3), src ); \
\
temp1v = vec_u8_to_s16_h( src1v ); \
temp2v = vec_u8_to_s16_h( src2v ); \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x+i_stride*y, dstv ); \
}
#define HPEL_FILTER_CENTRAL() \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
+ vec_vsx_st( destv, x-16+i_stride*y, dstc ); \
}
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
- PREP_LOAD;
- PREP_LOAD_SRC( src);
- PREP_STORE16;
- PREP_STORE16_DST( dsth );
LOAD_ZERO;
vec_u16_t twov, fourv, fivev, sixv;
}
/* Partial vertical filter */
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
- VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
+ src1v = vec_vsx_ld( x+i_stride*(y-2), src );
+ src2v = vec_vsx_ld( x+i_stride*(y-1), src );
+ src3v = vec_vsx_ld( x+i_stride*(y-0), src );
+ src4v = vec_vsx_ld( x+i_stride*(y+1), src );
+ src5v = vec_vsx_ld( x+i_stride*(y+2), src );
+ src6v = vec_vsx_ld( x+i_stride*(y+3), src );
temp1v = vec_u8_to_s16_h( src1v );
temp2v = vec_u8_to_s16_h( src2v );
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 2, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 4, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
PREP_STORE8;
vec_u8_t srcv;
vec_s16_t weightv;
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, roundv );
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 8, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weightv = vec_u8_to_s16( srcv );
weightv = vec_mladd( weightv, scalev, offsetv );
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( src );
vec_u8_t srcv;
vec_s16_t weight_lv, weight_hv;
vec_s16_t scalev, offsetv, denomv, roundv;
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weight_hv = vec_u8_to_s16_h( srcv );
weight_lv = vec_u8_to_s16_l( srcv );
{
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- VEC_LOAD( src, srcv, 16, vec_u8_t, src );
+ srcv = vec_vsx_ld( 0, src );
weight_hv = vec_u8_to_s16_h( srcv );
weight_lv = vec_u8_to_s16_l( srcv );
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_LOAD_SRC( src );
- vec_u8_t src_1v, src_2v, src_3v;
+ vec_u8_t srcv, srcv2;
vec_s16_t weight_lv, weight_hv, weight_3v;
vec_s16_t scalev, offsetv, denomv, roundv;
vec_s16_u loadv;
if( denom >= 1 )
{
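+ /* only lanes 0-3 carry the real scale/offset/denom/round; lanes 4-7 are identity (scale 1, shift 0, offset 0) so bytes past the 20-pixel width pass through unweighted */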
+ int16_t round = 1 << (denom - 1);
+ vec_s16_t tab[4] = {
+ { weight->i_scale, weight->i_scale, weight->i_scale, weight->i_scale, 1, 1, 1, 1 },
+ { weight->i_offset, weight->i_offset, weight->i_offset, weight->i_offset, 0, 0, 0, 0 },
+ { denom, denom, denom, denom, 0, 0, 0, 0 },
+ { round, round, round, round, 0, 0, 0, 0 },
+ };
+
loadv.s[0] = denom;
denomv = vec_splat( loadv.v, 0 );
- loadv.s[0] = 1<<(denom - 1);
+ loadv.s[0] = round;
roundv = vec_splat( loadv.v, 0 );
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- src_1v = vec_ld( 0, src );
- src_2v = vec_ld( 16, src );
- src_3v = vec_ld( 19, src );
- src_1v = vec_perm( src_1v, src_2v, _src_ );
- src_3v = vec_perm( src_2v, src_3v, _src_ );
- weight_hv = vec_u8_to_s16_h( src_1v );
- weight_lv = vec_u8_to_s16_l( src_1v );
- weight_3v = vec_u8_to_s16_h( src_3v );
+ srcv = vec_vsx_ld( 0, src );
+ srcv2 = vec_vsx_ld( 16, src );
+
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+ weight_3v = vec_u8_to_s16_h( srcv2 );
weight_hv = vec_mladd( weight_hv, scalev, roundv );
weight_lv = vec_mladd( weight_lv, scalev, roundv );
- weight_3v = vec_mladd( weight_3v, scalev, roundv );
+ weight_3v = vec_mladd( weight_3v, tab[0], tab[3] );
+
weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
- weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
+ weight_3v = vec_sra( weight_3v, (vec_u16_t)tab[2] );
+
weight_hv = vec_add( weight_hv, offsetv );
weight_lv = vec_add( weight_lv, offsetv );
- weight_3v = vec_add( weight_3v, offsetv );
+ weight_3v = vec_add( weight_3v, tab[1] );
- src_1v = vec_packsu( weight_hv, weight_lv );
- src_3v = vec_packsu( weight_3v, zero_s16v );
- vec_st( src_1v, 0, dst );
- vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
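+ /* second vector: 4 weighted pixels followed by the unmodified remainder of the source row */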
+ srcv = vec_packsu( weight_hv, weight_lv );
+ srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) );
+ vec_vsx_st( srcv, 0, dst );
+ vec_vsx_st( srcv2, 16, dst );
}
}
else
{
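+ /* apply the offset only to lanes 0-3, which hold the 4 extra weighted pixels */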
+ vec_s16_t offset_mask = { weight->i_offset, weight->i_offset, weight->i_offset,
+ weight->i_offset, 0, 0, 0, 0 };
for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
{
- src_1v = vec_ld( 0, src );
- src_2v = vec_ld( 16, src );
- src_3v = vec_ld( 19, src );
- src_1v = vec_perm( src_1v, src_2v, _src_ );
- src_3v = vec_perm( src_2v, src_3v, _src_ );
- weight_hv = vec_u8_to_s16_h( src_1v );
- weight_lv = vec_u8_to_s16_l( src_1v );
- weight_3v = vec_u8_to_s16_h( src_3v );
+ srcv = vec_vsx_ld( 0, src );
+ srcv2 = vec_vsx_ld( 16, src );
+
+ weight_hv = vec_u8_to_s16_h( srcv );
+ weight_lv = vec_u8_to_s16_l( srcv );
+ weight_3v = vec_u8_to_s16_h( srcv2 );
weight_hv = vec_mladd( weight_hv, scalev, offsetv );
weight_lv = vec_mladd( weight_lv, scalev, offsetv );
- weight_3v = vec_mladd( weight_3v, scalev, offsetv );
+ weight_3v = vec_mladd( weight_3v, scalev, offset_mask );
- src_1v = vec_packsu( weight_hv, weight_lv );
- src_3v = vec_packsu( weight_3v, zero_s16v );
- vec_st( src_1v, 0, dst );
- vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
+ srcv = vec_packsu( weight_hv, weight_lv );
+ srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) );
+ vec_vsx_st( srcv, 0, dst );
+ vec_vsx_st( srcv2, 16, dst );
}
}
}
ALIGNED_16( int sum ); \
\
LOAD_ZERO; \
- PREP_LOAD; \
vec_u8_t pix1v, pix2v; \
vec_s32_t sumv = zero_s32v; \
for( int y = 0; y < ly; y++ ) \
{ \
- VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t ); \
- VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t ); \
+ pix1v = vec_vsx_ld( 0, pix1 ); \
+ pix2v = vec_vsx_ld( 0, pix2 ); \
sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
ALIGNED_16( int i_satd );
PREP_DIFF;
- PREP_LOAD_SRC( pix1 );
vec_s16_t diff0v, diff1v, diff2v, diff3v;
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
/* Hadamar H */
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
VEC_ADD_ABS( temp2v, satdv, satdv );
VEC_ADD_ABS( temp3v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
-
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- vec_u8_t _offset1_1v_ = vec_lvsl(0, pix1);
- vec_u8_t _offset1_2v_ = vec_lvsl(0, pix1 + i_pix1);
- vec_u8_t _offset2_1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2_2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset1_2v, offset2_2v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1_1v, offset2_1v );
- VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset1_2v, offset2_2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- PREP_LOAD_SRC( pix1 );
- vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
- vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v , offset1v);
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
VEC_ADD_ABS( temp6v, satdv, satdv );
VEC_ADD_ABS( temp7v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
ALIGNED_16( int i_satd );
LOAD_ZERO;
- PREP_LOAD;
- PREP_LOAD_SRC( pix2 );
vec_s32_t satdv;
vec_s16_t pix1v, pix2v;
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
ALIGNED_16( int i_satd );
LOAD_ZERO;
- PREP_LOAD;
vec_s32_t satdv;
vec_s16_t pix1v, pix2v;
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
diffl4v, diffl5v, diffl6v, diffl7v;
vec_s16_t temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
- PREP_LOAD_SRC( pix2 );
-
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- //vec_u8_t perm0v, perm1v, perm2v, perm3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
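+ /* pix0-pix3 may be unaligned; fenc stays on vec_ld since it is 16-byte aligned */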
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv; // temporary load vectors
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld( 0, pix0 );
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld( 0, pix1 );
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld( 0, pix2 );
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld( 0, pix3 );
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB,permEncv;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 8; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum3 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v, sum3v;
sum0v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
- perm3vA = vec_lvsl(0, pix3);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
- perm3vB = vec_lvsl(0, pix3 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- temp_lv = vec_ld(0, pix3);
- temp_hv = vec_ld(16, pix3);
- pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
+ pix3v = vec_vsx_ld(0, pix3);
pix3 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
ALIGNED_16( int sum2 );
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
- vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;
-
vec_s32_t sum0v, sum1v, sum2v;
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
- permEncv = vec_lvsl(0, fenc);
- perm0vA = vec_lvsl(0, pix0);
- perm1vA = vec_lvsl(0, pix1);
- perm2vA = vec_lvsl(0, pix2);
-
- perm0vB = vec_lvsl(0, pix0 + i_stride);
- perm1vB = vec_lvsl(0, pix1 + i_stride);
- perm2vB = vec_lvsl(0, pix2 + i_stride);
-
for( int y = 0; y < 4; y++ )
{
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
- temp_lv = vec_ld(0, pix0);
- temp_hv = vec_ld(16, pix0);
- pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
+ pix0v = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- temp_lv = vec_ld(0, pix1);
- temp_hv = vec_ld(16, pix1);
- pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
+ pix1v = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- temp_lv = vec_ld(0, fenc);
- fencv = vec_perm(temp_lv, temp_hv, permEncv);
+ fencv = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
+ pix2v = vec_vsx_ld(0, pix2);
pix2 += i_stride;
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
vec_u32_t sumv;
vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
- vec_u8_t temp_lv, temp_hv;
- vec_u8_t permA, permB;
sumv = vec_splat_u32(0);
- permA = vec_lvsl(0, pix2);
- permB = vec_lvsl(0, pix2 + i_stride_pix2);
-
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vA = vec_perm(temp_lv, temp_hv, permA);
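+ /* pix2 may be unaligned; pix1 is assumed 16-byte aligned and keeps vec_ld */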
+ pix2vA = vec_vsx_ld(0, pix2);
pix1vA = vec_ld(0, pix1);
for( int y = 0; y < 7; y++ )
maxA = vec_max(pix1vA, pix2vA);
minA = vec_min(pix1vA, pix2vA);
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vB = vec_perm(temp_lv, temp_hv, permB);
+ pix2vB = vec_vsx_ld(0, pix2);
pix1vB = vec_ld(0, pix1);
diffA = vec_sub(maxA, minA);
maxB = vec_max(pix1vB, pix2vB);
minB = vec_min(pix1vB, pix2vB);
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vA = vec_perm(temp_lv, temp_hv, permA);
+ pix2vA = vec_vsx_ld(0, pix2);
pix1vA = vec_ld(0, pix1);
diffB = vec_sub(maxB, minB);
pix1 += i_stride_pix1;
pix2 += i_stride_pix2;
- temp_lv = vec_ld(0, pix2);
- temp_hv = vec_ld(16, pix2);
- pix2vB = vec_perm(temp_lv, temp_hv, permB);
+ pix2vB = vec_vsx_ld(0, pix2);
pix1vB = vec_ld(0, pix1);
maxA = vec_max(pix1vA, pix2vA);
vec_u8_t pix1v, pix2v;
vec_u32_t sumv;
vec_u8_t maxv, minv, diffv;
- vec_u8_t temp_lv, temp_hv;
- vec_u8_t perm1v, perm2v;
const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
sumv = vec_splat_u32(0);
- perm1v = vec_lvsl(0, pix1);
- perm2v = vec_lvsl(0, pix2);
-
for( int y = 0; y < 8; y++ )
{
- temp_hv = vec_ld(0, pix1);
- temp_lv = vec_ld(7, pix1);
- pix1v = vec_perm(temp_hv, temp_lv, perm1v);
-
- temp_hv = vec_ld(0, pix2);
- temp_lv = vec_ld(7, pix2);
- pix2v = vec_perm(temp_hv, temp_lv, perm2v);
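+ /* unaligned 16-byte loads; only the first 8 bytes of each row belong to this block */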
+ pix1v = vec_vsx_ld(0, pix1);
+ pix2v = vec_vsx_ld(0, pix2);
maxv = vec_max(pix1v, pix2v);
minv = vec_min(pix1v, pix2v);
int32_t i_satd=0;
PREP_DIFF;
- PREP_LOAD_SRC( pix1 );
- PREP_LOAD_SRC( pix2 );
vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
vec_u8_t pix1v, pix2v;
vec_u32_t s1v, s2v, ssv, s12v;
- PREP_LOAD;
- PREP_LOAD_SRC (pix1);
- PREP_LOAD_SRC (pix2);
LOAD_ZERO;
s1v = s2v = ssv = s12v = zero_u32v;
for( int y = 0; y < 4; y++ )
{
- VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
- VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
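+ /* each row is addressed through the byte-offset argument of vec_vsx_ld */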
+ pix1v = vec_vsx_ld( y*stride1, pix1 );
+ pix2v = vec_vsx_ld( y*stride2, pix2 );
s1v = vec_sum4s( pix1v, s1v );
s2v = vec_sum4s( pix2v, s2v );