From: Guillaume Poirier
Date: Mon, 12 Nov 2007 20:36:33 +0000 (+0000)
Subject: Add AltiVec implementation of add4x4_idct, add8x8_idct, add16x16_idct, 3.2x faster...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a8650641e1006d9750cc97d3c1672871c4549296;p=libx264

Add AltiVec implementation of add4x4_idct, add8x8_idct, add16x16_idct, 3.2x faster on average
1.05x faster overall with default encoding options

Patch by Noboru Asai % noboru DD asai AA gmail DD com %

git-svn-id: svn://svn.videolan.org/x264/trunk@685 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/dct.c b/common/dct.c
index 15bcfd04..0e724e42 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -440,6 +440,10 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
         dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
 
+        dctf->add4x4_idct = x264_add4x4_idct_altivec;
+        dctf->add8x8_idct = x264_add8x8_idct_altivec;
+        dctf->add16x16_idct = x264_add16x16_idct_altivec;
+
         dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index 87aab07d..87685305 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -233,6 +233,99 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *p
  * IDCT transform:
  ****************************************************************************/
 
+#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \
+{ \
+    /* a0 = SRC(0) + SRC(2); */ \
+    vec_s16_t a0v = vec_add(s0, s2); \
+    /* a1 = SRC(0) - SRC(2); */ \
+    vec_s16_t a1v = vec_sub(s0, s2); \
+    /* a2 = (SRC(1)>>1) - SRC(3); */ \
+    vec_s16_t a2v = vec_sub(vec_sra(s1, onev), s3); \
+    /* a3 = (SRC(3)>>1) + SRC(1); */ \
+    vec_s16_t a3v = vec_add(vec_sra(s3, onev), s1); \
+    /* DST(0, a0 + a3); */ \
+    d0 = vec_add(a0v, a3v); \
+    /* DST(1, a1 + a2); */ \
+    d1 = vec_add(a1v, a2v); \
+    /* DST(2, a1 - a2); */ \
+    d2 = vec_sub(a1v, a2v); \
+    /* DST(3, a0 - a3); */ \
+    d3 = vec_sub(a0v, a3v); \
+}
+
+#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
+    vdst_orig = vec_ld(0, dst); \
+    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
+    vdst_ss = (vec_s16_t)vec_mergeh(zero_u8v, vdst); \
+    va = vec_add(va, vdst_ss); \
+    va_u8 = vec_s16_to_u8(va); \
+    va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
+    vec_ste(va_u32, element, (uint32_t*)dst);
+
+#define ALTIVEC_STORE4_SUM_CLIP(dest, idctv, perm_ldv) \
+{ \
+    /* unaligned load */ \
+    vec_u8_t lv = vec_ld(0, dest); \
+    vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
+    vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
+    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
+    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
+    vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \
+    /* unaligned store */ \
+    vec_u32_t bodyv = vec_splat((vec_u32_t)idstsum8, 0); \
+    int element = ((unsigned long)dest & 0xf) >> 2; \
+    vec_ste(bodyv, element, (uint32_t *)dest); \
+}
+
+void x264_add4x4_idct_altivec( uint8_t *dst, int16_t dct[4][4] )
+{
+    vec_u16_t onev = vec_splat_u16(1);
+
+    dct[0][0] += 32; // rounding for the >>6 at the end
+
+    vec_s16_t s0, s1, s2, s3;
+
+    s0 = vec_ld( 0x00, (int16_t*)dct );
+    s1 = vec_sld( s0, s0, 8 );
+    s2 = vec_ld( 0x10, (int16_t*)dct );
+    s3 = vec_sld( s2, s2, 8 );
+
+    vec_s16_t d0, d1, d2, d3;
+    IDCT_1D_ALTIVEC( s0, s1, s2, s3, d0, d1, d2, d3 );
+
+    vec_s16_t tr0, tr1, tr2, tr3;
+
+    VEC_TRANSPOSE_4( d0, d1, d2, d3, tr0, tr1, tr2, tr3 );
+
+    vec_s16_t idct0, idct1, idct2, idct3;
+    IDCT_1D_ALTIVEC( tr0, tr1, tr2, tr3, idct0, idct1, idct2, idct3 );
+
+    vec_u8_t perm_ldv = vec_lvsl( 0, dst );
+    vec_u16_t sixv = vec_splat_u16(6);
+    LOAD_ZERO;
+
+    ALTIVEC_STORE4_SUM_CLIP( &dst[0*FDEC_STRIDE], idct0, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[1*FDEC_STRIDE], idct1, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[2*FDEC_STRIDE], idct2, perm_ldv );
+    ALTIVEC_STORE4_SUM_CLIP( &dst[3*FDEC_STRIDE], idct3, perm_ldv );
+}
+
+void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] )
+{
+    x264_add4x4_idct_altivec( &p_dst[0], dct[0] );
+    x264_add4x4_idct_altivec( &p_dst[4], dct[1] );
+    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+    x264_add4x4_idct_altivec( &p_dst[4*FDEC_STRIDE+4], dct[3] );
+}
+
+void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] )
+{
+    x264_add8x8_idct_altivec( &p_dst[0], &dct[0] );
+    x264_add8x8_idct_altivec( &p_dst[8], &dct[4] );
+    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
+    x264_add8x8_idct_altivec( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
+}
+
 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7)\
 {\
     /* a0 = SRC(0) + SRC(4); */ \
@@ -362,4 +455,3 @@ void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][8][8] )
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
 }
-
diff --git a/common/ppc/dct.h b/common/ppc/dct.h
index 0e0df524..9aa60456 100644
--- a/common/ppc/dct.h
+++ b/common/ppc/dct.h
@@ -32,6 +32,10 @@ void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
 void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
         uint8_t *pix1, uint8_t *pix2 );
 
+void x264_add4x4_idct_altivec( uint8_t *p_dst, int16_t dct[4][4] );
+void x264_add8x8_idct_altivec( uint8_t *p_dst, int16_t dct[4][4][4] );
+void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][4][4] );
+
 void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
         uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index e13e9753..7768f8ae 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -64,7 +64,7 @@
 #define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
 
 #define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
-#define vec_s16_to_u8(v) vec_pack( v, zero_u16v )
+#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )
 
 /***********************************************************************
  * PREP_LOAD: declares two vectors required to perform unaligned loads
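
For reference (not part of the patch): below is a minimal scalar sketch of the transform the new AltiVec routines vectorize, written only from the comments in IDCT_1D_ALTIVEC and ALTIVEC_STORE4_SUM_CLIP above. The names add4x4_idct_ref and clip_u8 are illustrative, not taken from the x264 source. The explicit clamp at the end is also why the ppccommon.h hunk switches vec_s16_to_u8() from vec_pack (modular truncation) to vec_packsu (unsigned saturating pack): the reconstructed pixels must saturate to [0,255].

#include <stdint.h>

#define FDEC_STRIDE 32   /* x264's decoded-picture stride */

static uint8_t clip_u8( int x )   /* same clamp vec_packsu performs */
{
    return x < 0 ? 0 : x > 255 ? 255 : x;
}

/* Scalar sketch of add4x4_idct: 1-D butterfly on rows, then on columns,
 * then add to the prediction with (x + 32) >> 6 rounding.  The AltiVec
 * version folds the +32 into dct[0][0] before the transform, which is
 * equivalent because the transform spreads that DC offset to all 16 outputs. */
static void add4x4_idct_ref( uint8_t *dst, int16_t dct[4][4] )
{
    int tmp[4][4], d[4][4];
    int i, x, y;

    for( i = 0; i < 4; i++ )          /* row pass */
    {
        int a0 = dct[i][0] + dct[i][2];
        int a1 = dct[i][0] - dct[i][2];
        int a2 = (dct[i][1] >> 1) - dct[i][3];
        int a3 = (dct[i][3] >> 1) + dct[i][1];
        tmp[i][0] = a0 + a3;
        tmp[i][1] = a1 + a2;
        tmp[i][2] = a1 - a2;
        tmp[i][3] = a0 - a3;
    }
    for( i = 0; i < 4; i++ )          /* column pass, same butterfly */
    {
        int a0 = tmp[0][i] + tmp[2][i];
        int a1 = tmp[0][i] - tmp[2][i];
        int a2 = (tmp[1][i] >> 1) - tmp[3][i];
        int a3 = (tmp[3][i] >> 1) + tmp[1][i];
        d[0][i] = a0 + a3;
        d[1][i] = a1 + a2;
        d[2][i] = a1 - a2;
        d[3][i] = a0 - a3;
    }
    for( y = 0; y < 4; y++ )          /* reconstruct: prediction + residual */
        for( x = 0; x < 4; x++ )
            dst[y*FDEC_STRIDE + x] =
                clip_u8( dst[y*FDEC_STRIDE + x] + ((d[y][x] + 32) >> 6) );
}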