From: Alexandra Hajkova
Date: Sun, 8 Jul 2018 18:04:43 +0000 (-0500)
Subject: ppc: Add support for Power9-only vec_absd
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=44f1671369b54734db1775fe5155f17041344d8f;p=libx264

ppc: Add support for Power9-only vec_absd

Increases overall encoding speed on POWER9 by 8%.
---
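vec_absd() is a POWER9 (Power ISA 3.0) intrinsic that returns the element-wise
absolute difference |a - b| of two unsigned vectors in a single instruction
(vabsdub for bytes), which is what lets the SAD/SSD kernels below collapse each
vec_sub( vec_max(), vec_min() ) sequence into one call; common/ppc/ppccommon.h
gains a macro fallback so pre-POWER9 builds keep the old sequence. A minimal
standalone sketch of the same idea follows -- the main() driver and its test
values are illustrative only (not part of x264), and it assumes a GCC/Clang
build with AltiVec/VSX enabled (e.g. -maltivec):

    #include <altivec.h>
    #include <stdio.h>

    #ifndef __POWER9_VECTOR__
    /* Same fallback the patch adds to common/ppc/ppccommon.h: for unsigned
     * elements |a - b| == max(a,b) - min(a,b), so no wrap-around can occur. */
    #define vec_absd( a, b ) vec_sub( vec_max( a, b ), vec_min( a, b ) )
    #endif

    int main( void )
    {
        vector unsigned char a = { 10, 200, 7,   0, 55, 91, 3, 128,
                                   10, 200, 7,   0, 55, 91, 3, 128 };
        vector unsigned char b = { 20, 100, 7, 255, 50, 99, 4, 127,
                                   20, 100, 7, 255, 50, 99, 4, 127 };

        /* Per-byte |a[i] - b[i]|: one vabsdub on POWER9, max/min/sub otherwise. */
        vector unsigned char d = vec_absd( a, b );

        for( int i = 0; i < 16; i++ )
            printf( "%u ", (unsigned)d[i] ); /* 10 100 0 255 5 8 1 1 ... */
        printf( "\n" );
        return 0;
    }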
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index c39f0b10..4f2df9a7 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -141,11 +141,7 @@ static inline void write16x4( uint8_t *dst, int dst_stride,
 // out: o = |x-y| < a
 static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a )
 {
-    register vec_u8_t diff = vec_subs(x, y);
-    register vec_u8_t diffneg = vec_subs(y, x);
-    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
-    o = (vec_u8_t)vec_cmplt(o, a);
-    return o;
+    return (vec_u8_t)vec_cmplt(vec_absd(x, y), a);
 }
 
 static inline vec_u8_t h264_deblock_mask( register vec_u8_t p0, register vec_u8_t p1, register vec_u8_t q0,
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index a01d95b0..5e1a1cec 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -47,8 +47,7 @@ static int name( uint8_t *pix1, intptr_t i_pix1,        \
         pix1v = vec_vsx_ld( 0, pix1 );                  \
         pix2v = vec_vsx_ld( 0, pix2 );                  \
         sumv = (vec_s32_t) vec_sum4s(                   \
-                   vec_sub( vec_max( pix1v, pix2v ),    \
-                            vec_min( pix1v, pix2v ) ),  \
+                   vec_absd( pix1v, pix2v ),            \
                    (vec_u32_t) sumv );                  \
         pix1 += i_pix1;                                 \
         pix2 += i_pix2;                                 \
@@ -636,10 +635,10 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld( 0, pix3 );
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
 
         pix0v = vec_vsx_ld( 0, pix0 );
         pix0 += i_stride;
@@ -656,10 +655,11 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld( 0, pix3 );
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
+
     }
 
     sum0v = vec_sums( sum0v, zero_s32v );
@@ -713,10 +713,9 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld( 0, pix2 );
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
 
         pix0v = vec_vsx_ld( 0, pix0 );
         pix0 += i_stride;
@@ -730,9 +729,9 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld( 0, pix2 );
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
     }
 
     sum0v = vec_sums( sum0v, zero_s32v );
@@ -786,10 +785,10 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
         pix3v = vec_vsx_ld( 0, pix3 );
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
 
         pix0v = vec_vsx_ld( 0, pix0 );
         pix0 += i_stride;
@@ -806,10 +805,10 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
         pix3v = vec_vsx_ld( 0, pix3 );
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
     }
 
     sum0v = vec_sums( sum0v, zero_s32v );
@@ -863,9 +862,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
 
         pix0v = vec_vsx_ld(0, pix0);
         pix0 += i_stride;
@@ -879,9 +878,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
     }
 
     sum0v = vec_sums( sum0v, zero_s32v );
@@ -938,10 +937,10 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld(0, pix3);
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
 
         pix0v = vec_vsx_ld(0, pix0);
         pix0 += i_stride;
@@ -958,10 +957,10 @@ static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld(0, pix3);
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
     }
 
     sum0v = vec_sum2s( sum0v, zero_s32v );
@@ -1015,9 +1014,9 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
 
         pix0v = vec_vsx_ld(0, pix0);
         pix0 += i_stride;
@@ -1031,9 +1030,9 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
     }
 
     sum0v = vec_sum2s( sum0v, zero_s32v );
@@ -1089,10 +1088,10 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld(0, pix3);
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
 
         pix0v = vec_vsx_ld(0, pix0);
         pix0 += i_stride;
@@ -1109,10 +1108,10 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
         pix3v = vec_vsx_ld(0, pix3);
         pix3 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
-        sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
+        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
     }
 
     sum0v = vec_sum2s( sum0v, zero_s32v );
@@ -1166,9 +1165,9 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
 
         pix0v = vec_vsx_ld(0, pix0);
         pix0 += i_stride;
@@ -1182,9 +1181,9 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
         pix2v = vec_vsx_ld(0, pix2);
         pix2 += i_stride;
 
-        sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
-        sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
-        sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
+        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
+        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
+        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }
 
     sum0v = vec_sum2s( sum0v, zero_s32v );
@@ -1216,7 +1215,7 @@ static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
     LOAD_ZERO;
     vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
     vec_u32_t sumv;
-    vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
+    vec_u8_t diffA, diffB;
 
     sumv = vec_splat_u32(0);
 
@@ -1228,25 +1227,19 @@
         pix1 += i_stride_pix1;
         pix2 += i_stride_pix2;
 
-        maxA = vec_max(pix1vA, pix2vA);
-        minA = vec_min(pix1vA, pix2vA);
-
         pix2vB = vec_vsx_ld(0, pix2);
         pix1vB = vec_ld(0, pix1);
 
-        diffA = vec_sub(maxA, minA);
+        diffA = vec_absd(pix1vA, pix2vA);
         sumv = vec_msum(diffA, diffA, sumv);
 
         pix1 += i_stride_pix1;
         pix2 += i_stride_pix2;
 
-        maxB = vec_max(pix1vB, pix2vB);
-        minB = vec_min(pix1vB, pix2vB);
-
        pix2vA = vec_vsx_ld(0, pix2);
        pix1vA = vec_ld(0, pix1);
 
-        diffB = vec_sub(maxB, minB);
+        diffB = vec_absd(pix1vB, pix2vB);
         sumv = vec_msum(diffB, diffB, sumv);
     }
 
@@ -1256,16 +1249,10 @@
     pix2vB = vec_vsx_ld(0, pix2);
     pix1vB = vec_ld(0, pix1);
 
-    maxA = vec_max(pix1vA, pix2vA);
-    minA = vec_min(pix1vA, pix2vA);
-
-    maxB = vec_max(pix1vB, pix2vB);
-    minB = vec_min(pix1vB, pix2vB);
-
-    diffA = vec_sub(maxA, minA);
+    diffA = vec_absd(pix1vA, pix2vA);
     sumv = vec_msum(diffA, diffA, sumv);
 
-    diffB = vec_sub(maxB, minB);
+    diffB = vec_absd(pix1vB, pix2vB);
     sumv = vec_msum(diffB, diffB, sumv);
 
     sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
@@ -1282,7 +1269,7 @@ static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
     LOAD_ZERO;
     vec_u8_t pix1v, pix2v;
     vec_u32_t sumv;
-    vec_u8_t maxv, minv, diffv;
+    vec_u8_t diffv;
 
     const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
 
@@ -1293,10 +1280,7 @@ static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
         pix1v = vec_vsx_ld(0, pix1);
         pix2v = vec_vsx_ld(0, pix2);
 
-        maxv = vec_max(pix1v, pix2v);
-        minv = vec_min(pix1v, pix2v);
-
-        diffv = vec_sub(maxv, minv);
+        diffv = vec_absd( pix1v, pix2v );
         sumv = vec_msum(diffv, diffv, sumv);
 
         pix1 += i_stride_pix1;
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index f0e736e1..fd9d6a7d 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -305,6 +305,10 @@ p2 += i2;
 } while( 0 )
 #endif
 
+#ifndef __POWER9_VECTOR__
+#define vec_absd( a, b ) vec_sub( vec_max( a, b ), vec_min( a, b ) )
+#endif
+
 // vec_xxpermdi is quite useful but some version of clang do not expose it
 #if !HAVE_VSX || (defined(__clang__) && __clang_major__ < 6)
 static const vec_u8_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,