static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
- uint8_t *src2, int i_height )
+ uint8_t *src1, int i_src1,
+ uint8_t *src2, int i_height )
{
int x, y;
for( y = 0; y < i_height; y++ )
}
static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
- uint8_t *src2, int i_height )
+ uint8_t *src1, int i_src1,
+ uint8_t *src2, int i_height )
{
int y;
vec_u8_t src1v, src2v;
}
static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
- uint8_t *src2, int i_height )
+ uint8_t *src1, int i_src1,
+ uint8_t *src2, int i_height )
{
int y;
vec_u8_t src1v, src2v;
}
static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
- uint8_t *src2, int i_height )
+ uint8_t *src1, int i_src1,
+ uint8_t *src2, int i_height )
{
x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
MC_COPY( x264_mc_copy_w4_altivec, 4 )
MC_COPY( x264_mc_copy_w8_altivec, 8 )
-static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src, int i_height )
+static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
+ uint8_t *src, int i_src, int i_height )
{
int y;
vec_u8_t cpyV;
{
VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
vec_st(cpyV, 0, dst);
-
+
src += i_src;
dst += i_dst;
}
static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
if( qpel_idx & 5 ) /* qpel interpolation needed */
{
uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-
+
switch(i_width) {
case 4:
x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
src += (mvy >> 3) * i_src_stride + (mvx >> 3);
srcp = &src[i_src_stride];
-
+
LOAD_ZERO;
PREP_LOAD;
PREP_LOAD_SRC( src );
vec_u8_t permv;
vec_u16_t shiftv;
vec_u16_t k32v;
-
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
src += (mvy >> 3) * i_src_stride + (mvx >> 3);
srcp = &src[i_src_stride];
-
+
LOAD_ZERO;
PREP_LOAD;
PREP_LOAD_SRC( src );
vec_u8_t permv;
vec_u16_t shiftv;
vec_u16_t k32v;
-
+
coeff0v = vec_ld( 0, coeff );
coeff3v = vec_splat( coeff0v, 3 );
coeff2v = vec_splat( coeff0v, 2 );
t1v = vec_add( t1v, t3v ); /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}
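/* Editor's sketch (illustrative, not part of this patch): the identity the
 * comment above relies on, with the taps paired symmetrically as
 * a = x[-2]+x[3], b = x[-1]+x[2], c = x[0]+x[1]:
 *   ((a-b)/4 - b + c)/4 + c = (a - b - 4b + 4c)/16 + c = (a - 5b + 20c)/16
 * i.e. the (1,-5,20,20,-5,1) half-pel filter built from adds, subs and
 * shifts only. Scalar form, hypothetical helper name: */
static inline int hpel_filter_pair_sketch( int a, int b, int c )
{
    int t = ( a - b ) >> 2;  /* (a-b)/4, as vec_sra by 2 does in the macro */
    t = ( t - b + c ) >> 2;  /* ((a-b)/4 - b + c)/4 */
    return t + c;            /* ~ (a - 5b + 20c)/16, up to shift rounding */
}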
-#define HPEL_FILTER_HORIZONTAL() \
-{ \
+#define HPEL_FILTER_HORIZONTAL() \
+{ \
VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
- \
- src2v = vec_sld( src1v, src6v, 1 ); \
- src3v = vec_sld( src1v, src6v, 2 ); \
- src4v = vec_sld( src1v, src6v, 3 ); \
- src5v = vec_sld( src1v, src6v, 4 ); \
- src6v = vec_sld( src1v, src6v, 5 ); \
- \
- temp1v = vec_u8_to_s16_h( src1v ); \
- temp2v = vec_u8_to_s16_h( src2v ); \
- temp3v = vec_u8_to_s16_h( src3v ); \
- temp4v = vec_u8_to_s16_h( src4v ); \
- temp5v = vec_u8_to_s16_h( src5v ); \
- temp6v = vec_u8_to_s16_h( src6v ); \
- \
- HPEL_FILTER_1( temp1v, temp2v, temp3v, \
- temp4v, temp5v, temp6v ); \
- \
- dest1v = vec_add( temp1v, sixteenv ); \
- dest1v = vec_sra( dest1v, fivev ); \
- \
- temp1v = vec_u8_to_s16_l( src1v ); \
- temp2v = vec_u8_to_s16_l( src2v ); \
- temp3v = vec_u8_to_s16_l( src3v ); \
- temp4v = vec_u8_to_s16_l( src4v ); \
- temp5v = vec_u8_to_s16_l( src5v ); \
- temp6v = vec_u8_to_s16_l( src6v ); \
- \
- HPEL_FILTER_1( temp1v, temp2v, temp3v, \
- temp4v, temp5v, temp6v ); \
- \
- dest2v = vec_add( temp1v, sixteenv ); \
- dest2v = vec_sra( dest2v, fivev ); \
- \
- destv = vec_packsu( dest1v, dest2v ); \
- \
- VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
-}
-
-#define HPEL_FILTER_VERTICAL() \
-{ \
- VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
- VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
+ \
+ src2v = vec_sld( src1v, src6v, 1 ); \
+ src3v = vec_sld( src1v, src6v, 2 ); \
+ src4v = vec_sld( src1v, src6v, 3 ); \
+ src5v = vec_sld( src1v, src6v, 4 ); \
+ src6v = vec_sld( src1v, src6v, 5 ); \
\
temp1v = vec_u8_to_s16_h( src1v ); \
temp2v = vec_u8_to_s16_h( src2v ); \
dest1v = vec_add( temp1v, sixteenv ); \
dest1v = vec_sra( dest1v, fivev ); \
\
- temp4v = vec_u8_to_s16_l( src1v ); \
- temp5v = vec_u8_to_s16_l( src2v ); \
- temp6v = vec_u8_to_s16_l( src3v ); \
- temp7v = vec_u8_to_s16_l( src4v ); \
- temp8v = vec_u8_to_s16_l( src5v ); \
- temp9v = vec_u8_to_s16_l( src6v ); \
+ temp1v = vec_u8_to_s16_l( src1v ); \
+ temp2v = vec_u8_to_s16_l( src2v ); \
+ temp3v = vec_u8_to_s16_l( src3v ); \
+ temp4v = vec_u8_to_s16_l( src4v ); \
+ temp5v = vec_u8_to_s16_l( src5v ); \
+ temp6v = vec_u8_to_s16_l( src6v ); \
\
- HPEL_FILTER_1( temp4v, temp5v, temp6v, \
- temp7v, temp8v, temp9v ); \
+ HPEL_FILTER_1( temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v ); \
\
- dest2v = vec_add( temp4v, sixteenv ); \
+ dest2v = vec_add( temp1v, sixteenv ); \
dest2v = vec_sra( dest2v, fivev ); \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
+ VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
+}
+
+#define HPEL_FILTER_VERTICAL() \
+{ \
+ VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
+ \
+ temp1v = vec_u8_to_s16_h( src1v ); \
+ temp2v = vec_u8_to_s16_h( src2v ); \
+ temp3v = vec_u8_to_s16_h( src3v ); \
+ temp4v = vec_u8_to_s16_h( src4v ); \
+ temp5v = vec_u8_to_s16_h( src5v ); \
+ temp6v = vec_u8_to_s16_h( src6v ); \
+ \
+ HPEL_FILTER_1( temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v ); \
+ \
+ dest1v = vec_add( temp1v, sixteenv ); \
+ dest1v = vec_sra( dest1v, fivev ); \
+ \
+ temp4v = vec_u8_to_s16_l( src1v ); \
+ temp5v = vec_u8_to_s16_l( src2v ); \
+ temp6v = vec_u8_to_s16_l( src3v ); \
+ temp7v = vec_u8_to_s16_l( src4v ); \
+ temp8v = vec_u8_to_s16_l( src5v ); \
+ temp9v = vec_u8_to_s16_l( src6v ); \
+ \
+ HPEL_FILTER_1( temp4v, temp5v, temp6v, \
+ temp7v, temp8v, temp9v ); \
+ \
+ dest2v = vec_add( temp4v, sixteenv ); \
+ dest2v = vec_sra( dest2v, fivev ); \
+ \
+ destv = vec_packsu( dest1v, dest2v ); \
+ \
+ VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
}
-#define HPEL_FILTER_CENTRAL() \
-{ \
- temp1v = vec_sld( tempav, tempbv, 12 ); \
- temp2v = vec_sld( tempav, tempbv, 14 ); \
- temp3v = tempbv; \
- temp4v = vec_sld( tempbv, tempcv, 2 ); \
- temp5v = vec_sld( tempbv, tempcv, 4 ); \
- temp6v = vec_sld( tempbv, tempcv, 6 ); \
- \
- HPEL_FILTER_2( temp1v, temp2v, temp3v, \
- temp4v, temp5v, temp6v ); \
- \
- dest1v = vec_add( temp1v, thirtytwov ); \
- dest1v = vec_sra( dest1v, sixv ); \
- \
- temp1v = vec_sld( tempbv, tempcv, 12 ); \
- temp2v = vec_sld( tempbv, tempcv, 14 ); \
- temp3v = tempcv; \
- temp4v = vec_sld( tempcv, tempdv, 2 ); \
- temp5v = vec_sld( tempcv, tempdv, 4 ); \
- temp6v = vec_sld( tempcv, tempdv, 6 ); \
- \
- HPEL_FILTER_2( temp1v, temp2v, temp3v, \
- temp4v, temp5v, temp6v ); \
- \
- dest2v = vec_add( temp1v, thirtytwov ); \
- dest2v = vec_sra( dest2v, sixv ); \
- \
- destv = vec_packsu( dest1v, dest2v ); \
- \
+#define HPEL_FILTER_CENTRAL() \
+{ \
+ temp1v = vec_sld( tempav, tempbv, 12 ); \
+ temp2v = vec_sld( tempav, tempbv, 14 ); \
+ temp3v = tempbv; \
+ temp4v = vec_sld( tempbv, tempcv, 2 ); \
+ temp5v = vec_sld( tempbv, tempcv, 4 ); \
+ temp6v = vec_sld( tempbv, tempcv, 6 ); \
+ \
+ HPEL_FILTER_2( temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v ); \
+ \
+ dest1v = vec_add( temp1v, thirtytwov ); \
+ dest1v = vec_sra( dest1v, sixv ); \
+ \
+ temp1v = vec_sld( tempbv, tempcv, 12 ); \
+ temp2v = vec_sld( tempbv, tempcv, 14 ); \
+ temp3v = tempcv; \
+ temp4v = vec_sld( tempcv, tempdv, 2 ); \
+ temp5v = vec_sld( tempcv, tempdv, 4 ); \
+ temp6v = vec_sld( tempcv, tempdv, 6 ); \
+ \
+ HPEL_FILTER_2( temp1v, temp2v, temp3v, \
+ temp4v, temp5v, temp6v ); \
+ \
+ dest2v = vec_add( temp1v, thirtytwov ); \
+ dest2v = vec_sra( dest2v, sixv ); \
+ \
+ destv = vec_packsu( dest1v, dest2v ); \
+ \
VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
temp6v = vec_u8_to_s16_h( src6v );
HPEL_FILTER_1( temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v );
+ temp4v, temp5v, temp6v );
/* central_filter */
tempav = tempcv;
* a: s16v
*
* a = abs(a)
- *
+ *
 * Use vec_sub()/vec_max() instead of vec_abs(), because vec_abs()
 * internally performs a vec_splat(0) and we already have a zero vector.
**********************************************************************/
-#define VEC_ABS(a) \
- a = vec_max( a, vec_sub( zero_s16v, a ) );
+#define VEC_ABS(a) \
+ a = vec_max( a, vec_sub( zero_s16v, a ) );
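/* Editor's sketch (illustrative, not part of this patch): the identity
 * VEC_ABS relies on is abs(a) == max(a, 0 - a), so with zero_s16v already
 * loaded the absolute value costs one vec_sub plus one vec_max and no extra
 * vec_splat(0). Scalar form, hypothetical helper name: */
static inline int16_t abs_via_max_sketch( int16_t a )
{
    int16_t neg = (int16_t)( 0 - a );  /* mirrors vec_sub( zero_s16v, a ) */
    return a > neg ? a : neg;          /* mirrors vec_max( a, neg ) */
}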
/***********************************************************************
* VEC_ADD_ABS
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
**********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
- VEC_ABS( a ); \
+ VEC_ABS( a ); \
c = vec_sum4s( a, b )
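/* Editor's sketch (illustrative, not part of this patch): what VEC_ADD_ABS
 * computes, assuming a holds 8 s16 lanes and b/c hold 4 s32 lanes, matching
 * vec_sum4s on halfwords. Hypothetical helper name: */
static inline void vec_add_abs_sketch( const int16_t a[8], const int32_t b[4], int32_t c[4] )
{
    for( int i = 0; i < 4; i++ )
    {
        int a0 = a[2*i]   < 0 ? -a[2*i]   : a[2*i];
        int a1 = a[2*i+1] < 0 ? -a[2*i+1] : a[2*i+1];
        c[i] = a0 + a1 + b[i];
    }
}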
/***********************************************************************
vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    /* Hadamard H */
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
-
+
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
-
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
* Interleaved SAD routines
**********************************************************************/
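/* Editor's note (illustrative, not part of this patch): AltiVec has no
 * unsigned subtract-with-absolute-value, so every routine below computes
 * |fenc - pix| as max(fenc,pix) - min(fenc,pix) and folds the 16 byte
 * differences of a row into 4 word partial sums with vec_sum4s. Scalar
 * equivalent for one 16-pixel row, hypothetical helper name: */
static inline void sad_row16_sketch( const uint8_t *fenc, const uint8_t *pix, int32_t sum[4] )
{
    for( int i = 0; i < 16; i++ )
    {
        int d = fenc[i] > pix[i] ? fenc[i] - pix[i] : pix[i] - fenc[i];
        sum[i/4] += d;  /* vec_sum4s: 4 adjacent bytes per 32-bit partial sum */
    }
}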
-static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
+static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
+ uint8_t *pix0, uint8_t *pix1,
+ uint8_t *pix2, uint8_t *pix3,
+ int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
int y;
-
+
LOAD_ZERO;
vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
//vec_u8_t perm0v, perm1v, perm2v, perm3v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
+
vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
-
+
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
perm3vA = vec_lvsl(0, pix3);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-
+
for (y = 0; y < 8; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
+
}
-
+
sum0v = vec_sums( sum0v, zero_s32v );
sum1v = vec_sums( sum1v, zero_s32v );
sum2v = vec_sums( sum2v, zero_s32v );
sum3v = vec_sums( sum3v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 3 );
sum1v = vec_splat( sum1v, 3 );
sum2v = vec_splat( sum2v, 3 );
sum3v = vec_splat( sum3v, 3 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
vec_ste( sum3v, 0, &sum3);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
scores[3] = sum3;
-
-
-
}
-static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
+static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
+ uint8_t *pix1, uint8_t *pix2,
+ int i_stride, int scores[3] )
{
-
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
int y;
-
+
LOAD_ZERO;
vec_u8_t temp_lv, temp_hv; // temporary load vectors
vec_u8_t fencv, pix0v, pix1v, pix2v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
+
vec_s32_t sum0v, sum1v, sum2v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
-
+
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
-
+
for (y = 0; y < 8; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
}
-
+
sum0v = vec_sums( sum0v, zero_s32v );
sum1v = vec_sums( sum1v, zero_s32v );
sum2v = vec_sums( sum2v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 3 );
sum1v = vec_splat( sum1v, 3 );
sum2v = vec_splat( sum2v, 3 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
-
-}
+}
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
-
+
vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
-
+
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
perm3vA = vec_lvsl(0, pix3);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-
-
+
for (y = 0; y < 4; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
-
}
-
+
sum0v = vec_sums( sum0v, zero_s32v );
sum1v = vec_sums( sum1v, zero_s32v );
sum2v = vec_sums( sum2v, zero_s32v );
sum3v = vec_sums( sum3v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 3 );
sum1v = vec_splat( sum1v, 3 );
sum2v = vec_splat( sum2v, 3 );
sum3v = vec_splat( sum3v, 3 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
vec_ste( sum3v, 0, &sum3);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
scores[3] = sum3;
-
-
-
}
-static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
+static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
+ uint8_t *pix1, uint8_t *pix2,
+ int i_stride, int scores[3] )
{
-
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
-
+
vec_s32_t sum0v, sum1v, sum2v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
-
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
-
+
for (y = 0; y < 4; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
fencv = vec_ld(0, fenc);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-
}
-
+
sum0v = vec_sums( sum0v, zero_s32v );
sum1v = vec_sums( sum1v, zero_s32v );
sum2v = vec_sums( sum2v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 3 );
sum1v = vec_splat( sum1v, 3 );
sum2v = vec_splat( sum2v, 3 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
-
-}
+}
-static void pixel_sad_x4_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
+static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
+ uint8_t *pix0, uint8_t *pix1,
+ uint8_t *pix2, uint8_t *pix3,
+ int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
+
vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
-
+
permEncv = vec_lvsl(0, fenc);
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
perm3vA = vec_lvsl(0, pix3);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-
+
for (y = 0; y < 8; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
}
-
+
sum0v = vec_sum2s( sum0v, zero_s32v );
sum1v = vec_sum2s( sum1v, zero_s32v );
sum2v = vec_sum2s( sum2v, zero_s32v );
sum3v = vec_sum2s( sum3v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 1 );
sum1v = vec_splat( sum1v, 1 );
sum2v = vec_splat( sum2v, 1 );
sum3v = vec_splat( sum3v, 1 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
vec_ste( sum3v, 0, &sum3);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
- scores[3] = sum3;
-
+ scores[3] = sum3;
}
-static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
+static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
+ uint8_t *pix1, uint8_t *pix2,
+ int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB,permEncv;
-
+
vec_s32_t sum0v, sum1v, sum2v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
-
+
permEncv = vec_lvsl(0, fenc);
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
-
+
for (y = 0; y < 8; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
}
-
-
+
sum0v = vec_sum2s( sum0v, zero_s32v );
sum1v = vec_sum2s( sum1v, zero_s32v );
sum2v = vec_sum2s( sum2v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 1 );
sum1v = vec_splat( sum1v, 1 );
sum2v = vec_splat( sum2v, 1 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
-
}
-static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
+static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
+ uint8_t *pix0, uint8_t *pix1,
+ uint8_t *pix2, uint8_t *pix3,
+ int i_stride, int scores[4] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
DECLARE_ALIGNED_16( int sum3 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
-
+
vec_s32_t sum0v, sum1v, sum2v, sum3v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
sum3v = vec_splat_s32(0);
-
+
permEncv = vec_lvsl(0, fenc);
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
perm3vA = vec_lvsl(0, pix3);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
perm3vB = vec_lvsl(0, pix3 + i_stride);
-
-
+
for (y = 0; y < 4; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
+
temp_lv = vec_ld(0, pix3);
temp_hv = vec_ld(16, pix3);
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
pix3 += i_stride;
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
- sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
+ sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
}
-
-
+
sum0v = vec_sum2s( sum0v, zero_s32v );
sum1v = vec_sum2s( sum1v, zero_s32v );
sum2v = vec_sum2s( sum2v, zero_s32v );
sum3v = vec_sum2s( sum3v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 1 );
sum1v = vec_splat( sum1v, 1 );
sum2v = vec_splat( sum2v, 1 );
sum3v = vec_splat( sum3v, 1 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
vec_ste( sum3v, 0, &sum3);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
- scores[3] = sum3;
-
-
+ scores[3] = sum3;
}
-static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )
+static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
+ uint8_t *pix1, uint8_t *pix2,
+ int i_stride, int scores[3] )
{
DECLARE_ALIGNED_16( int sum0 );
DECLARE_ALIGNED_16( int sum1 );
DECLARE_ALIGNED_16( int sum2 );
int y;
-
+
LOAD_ZERO;
- vec_u8_t temp_lv, temp_hv;
+ vec_u8_t temp_lv, temp_hv;
vec_u8_t fencv, pix0v, pix1v, pix2v;
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;
-
+
vec_s32_t sum0v, sum1v, sum2v;
-
+
sum0v = vec_splat_s32(0);
sum1v = vec_splat_s32(0);
sum2v = vec_splat_s32(0);
-
+
permEncv = vec_lvsl(0, fenc);
perm0vA = vec_lvsl(0, pix0);
perm1vA = vec_lvsl(0, pix1);
perm2vA = vec_lvsl(0, pix2);
-
+
perm0vB = vec_lvsl(0, pix0 + i_stride);
perm1vB = vec_lvsl(0, pix1 + i_stride);
perm2vB = vec_lvsl(0, pix2 + i_stride);
-
+
for (y = 0; y < 4; y++)
{
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+
temp_lv = vec_ld(0, pix0);
temp_hv = vec_ld(16, pix0);
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
pix0 += i_stride;
-
-
+
temp_lv = vec_ld(0, pix1);
temp_hv = vec_ld(16, pix1);
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
pix1 += i_stride;
-
+
temp_lv = vec_ld(0, fenc);
fencv = vec_perm(temp_lv, temp_hv, permEncv);
fenc += FENC_STRIDE;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
pix2 += i_stride;
-
-
- sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-
- sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-
- sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+
+ sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
+
+ sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
+
+ sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
}
-
-
+
sum0v = vec_sum2s( sum0v, zero_s32v );
sum1v = vec_sum2s( sum1v, zero_s32v );
sum2v = vec_sum2s( sum2v, zero_s32v );
-
+
sum0v = vec_splat( sum0v, 1 );
sum1v = vec_splat( sum1v, 1 );
sum2v = vec_splat( sum2v, 1 );
-
+
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
vec_ste( sum2v, 0, &sum2);
-
+
scores[0] = sum0;
scores[1] = sum1;
scores[2] = sum2;
uint8_t *pix2, int i_stride_pix2)
{
DECLARE_ALIGNED_16( int sum );
-
+
int y;
LOAD_ZERO;
vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
vec_u8_t temp_lv, temp_hv;
vec_u8_t permA, permB;
-
+
sumv = vec_splat_u32(0);
-
+
permA = vec_lvsl(0, pix2);
permB = vec_lvsl(0, pix2 + i_stride_pix2);
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2vA = vec_perm(temp_lv, temp_hv, permA);
pix1vA = vec_ld(0, pix1);
-
+
for (y=0; y < 7; y++)
{
pix1 += i_stride_pix1;
pix2 += i_stride_pix2;
-
-
+
maxA = vec_max(pix1vA, pix2vA);
minA = vec_min(pix1vA, pix2vA);
-
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2vB = vec_perm(temp_lv, temp_hv, permB);
pix1vB = vec_ld(0, pix1);
-
-
+
diffA = vec_sub(maxA, minA);
sumv = vec_msum(diffA, diffA, sumv);
-
+
pix1 += i_stride_pix1;
pix2 += i_stride_pix2;
-
+
maxB = vec_max(pix1vB, pix2vB);
minB = vec_min(pix1vB, pix2vB);
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2vA = vec_perm(temp_lv, temp_hv, permA);
pix1vA = vec_ld(0, pix1);
-
+
diffB = vec_sub(maxB, minB);
sumv = vec_msum(diffB, diffB, sumv);
-
}
-
+
pix1 += i_stride_pix1;
pix2 += i_stride_pix2;
-
+
temp_lv = vec_ld(0, pix2);
temp_hv = vec_ld(16, pix2);
pix2vB = vec_perm(temp_lv, temp_hv, permB);
pix1vB = vec_ld(0, pix1);
-
+
maxA = vec_max(pix1vA, pix2vA);
minA = vec_min(pix1vA, pix2vA);
-
- maxB = vec_max(pix1vB, pix2vB);
- minB = vec_min(pix1vB, pix2vB);
-
+
+ maxB = vec_max(pix1vB, pix2vB);
+ minB = vec_min(pix1vB, pix2vB);
+
diffA = vec_sub(maxA, minA);
sumv = vec_msum(diffA, diffA, sumv);
-
+
diffB = vec_sub(maxB, minB);
sumv = vec_msum(diffB, diffB, sumv);
-
+
sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
sumv = vec_splat(sumv, 3);
vec_ste((vec_s32_t) sumv, 0, &sum);
return sum;
-}
+}
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
DECLARE_ALIGNED_16( int sum );
-
+
int y;
LOAD_ZERO;
vec_u8_t pix1v, pix2v;
const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
sumv = vec_splat_u32(0);
-
+
perm1v = vec_lvsl(0, pix1);
perm2v = vec_lvsl(0, pix2);
-
+
for (y=0; y < 8; y++)
{
temp_hv = vec_ld(0, pix1);
maxv = vec_max(pix1v, pix2v);
minv = vec_min(pix1v, pix2v);
-
+
diffv = vec_sub(maxv, minv);
sumv = vec_msum(diffv, diffv, sumv);
vec_ste((vec_s32_t) sumv, 0, &sum);
return sum;
-}
+}
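/* Editor's sketch (illustrative, not part of this patch): the SSD routines
 * above use the same max/min trick for |pix1 - pix2| and then
 * vec_msum( diff, diff, sum ) to square the byte differences and add the
 * products into 32-bit partial sums. Scalar equivalent for one 16-byte row,
 * hypothetical helper name: */
static inline void ssd_row16_sketch( const uint8_t *pix1, const uint8_t *pix2, uint32_t sum[4] )
{
    for( int i = 0; i < 16; i++ )
    {
        int d = pix1[i] > pix2[i] ? pix1[i] - pix2[i] : pix2[i] - pix1[i];
        sum[i/4] += (uint32_t)( d * d );  /* mirrors vec_msum on unsigned bytes */
    }
}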
/**********************************************************************
* SA8D routines: sum of 8x8 Hadamard transformed differences
**********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
-#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v )\
-{\
- /* int a0 = SRC(0) + SRC(4) */\
- vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
- /* int a4 = SRC(0) - SRC(4) */\
- vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
- /* int a1 = SRC(1) + SRC(5) */\
- vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
- /* int a5 = SRC(1) - SRC(5) */\
- vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
- /* int a2 = SRC(2) + SRC(6) */\
- vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
- /* int a6 = SRC(2) - SRC(6) */\
- vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
- /* int a3 = SRC(3) + SRC(7) */\
- vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
- /* int a7 = SRC(3) - SRC(7) */\
- vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
-\
- /* int b0 = a0 + a2 */\
- vec_s16_t b0v = vec_add(a0v, a2v); \
- /* int b2 = a0 - a2; */\
- vec_s16_t b2v = vec_sub(a0v, a2v);\
- /* int b1 = a1 + a3; */\
- vec_s16_t b1v = vec_add(a1v, a3v); \
- /* int b3 = a1 - a3; */\
- vec_s16_t b3v = vec_sub(a1v, a3v); \
- /* int b4 = a4 + a6; */\
- vec_s16_t b4v = vec_add(a4v, a6v); \
- /* int b6 = a4 - a6; */\
- vec_s16_t b6v = vec_sub(a4v, a6v); \
- /* int b5 = a5 + a7; */\
- vec_s16_t b5v = vec_add(a5v, a7v); \
- /* int b7 = a5 - a7; */\
- vec_s16_t b7v = vec_sub(a5v, a7v); \
-\
- /* DST(0, b0 + b1) */\
- sa8d0v = vec_add(b0v, b1v); \
- /* DST(1, b0 - b1) */\
- sa8d1v = vec_sub(b0v, b1v); \
- /* DST(2, b2 + b3) */\
- sa8d2v = vec_add(b2v, b3v); \
- /* DST(3, b2 - b3) */\
- sa8d3v = vec_sub(b2v, b3v); \
- /* DST(4, b4 + b5) */\
- sa8d4v = vec_add(b4v, b5v); \
- /* DST(5, b4 - b5) */\
- sa8d5v = vec_sub(b4v, b5v); \
- /* DST(6, b6 + b7) */\
- sa8d6v = vec_add(b6v, b7v); \
- /* DST(7, b6 - b7) */\
- sa8d7v = vec_sub(b6v, b7v); \
+#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, \
+ sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
+{ \
+ /* int a0 = SRC(0) + SRC(4) */ \
+ vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
+ /* int a4 = SRC(0) - SRC(4) */ \
+ vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
+ /* int a1 = SRC(1) + SRC(5) */ \
+ vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
+ /* int a5 = SRC(1) - SRC(5) */ \
+ vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
+ /* int a2 = SRC(2) + SRC(6) */ \
+ vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
+ /* int a6 = SRC(2) - SRC(6) */ \
+ vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
+ /* int a3 = SRC(3) + SRC(7) */ \
+ vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
+ /* int a7 = SRC(3) - SRC(7) */ \
+ vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
+ \
+ /* int b0 = a0 + a2 */ \
+ vec_s16_t b0v = vec_add(a0v, a2v); \
+ /* int b2 = a0 - a2; */ \
+ vec_s16_t b2v = vec_sub(a0v, a2v); \
+ /* int b1 = a1 + a3; */ \
+ vec_s16_t b1v = vec_add(a1v, a3v); \
+ /* int b3 = a1 - a3; */ \
+ vec_s16_t b3v = vec_sub(a1v, a3v); \
+ /* int b4 = a4 + a6; */ \
+ vec_s16_t b4v = vec_add(a4v, a6v); \
+ /* int b6 = a4 - a6; */ \
+ vec_s16_t b6v = vec_sub(a4v, a6v); \
+ /* int b5 = a5 + a7; */ \
+ vec_s16_t b5v = vec_add(a5v, a7v); \
+ /* int b7 = a5 - a7; */ \
+ vec_s16_t b7v = vec_sub(a5v, a7v); \
+ \
+ /* DST(0, b0 + b1) */ \
+ sa8d0v = vec_add(b0v, b1v); \
+ /* DST(1, b0 - b1) */ \
+ sa8d1v = vec_sub(b0v, b1v); \
+ /* DST(2, b2 + b3) */ \
+ sa8d2v = vec_add(b2v, b3v); \
+ /* DST(3, b2 - b3) */ \
+ sa8d3v = vec_sub(b2v, b3v); \
+ /* DST(4, b4 + b5) */ \
+ sa8d4v = vec_add(b4v, b5v); \
+ /* DST(5, b4 - b5) */ \
+ sa8d5v = vec_sub(b4v, b5v); \
+ /* DST(6, b6 + b7) */ \
+ sa8d6v = vec_add(b6v, b7v); \
+ /* DST(7, b6 - b7) */ \
+ sa8d7v = vec_sub(b6v, b7v); \
}
-static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1,
+ uint8_t *pix2, int i_pix2 )
{
int32_t i_satd=0;
return i_satd;
}
-static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1,
+ uint8_t *pix2, int i_pix2 )
{
int32_t i_satd;
i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;
return i_satd;
}
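/* Editor's note (illustrative, not part of this patch): (x + 2) >> 2 above
 * is x/4 rounded to nearest, i.e. the per-8x8 Hadamard sum is scaled down by
 * four with rounding to give the SA8D score. Hypothetical helper name: */
static inline int sa8d_round_div4_sketch( int x )
{
    return ( x + 2 ) >> 2;  /* round-to-nearest division by 4 */
}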
-static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
+ uint8_t *pix2, int i_pix2 )
{
int32_t i_satd;
-
+
i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 )
+ pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 )
+ pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 )
pixf->satd[PIXEL_8x4] = pixel_satd_8x4_altivec;
pixf->satd[PIXEL_4x8] = pixel_satd_4x8_altivec;
pixf->satd[PIXEL_4x4] = pixel_satd_4x4_altivec;
-
+
pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_altivec;