From: Guillaume Poirier
Date: Mon, 12 Nov 2007 12:47:38 +0000 (+0000)
Subject: Add AltiVec implementation of quant_2x2_dc,
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=09334c1a26d8b5485f12c233242d0aaf91003aea;p=libx264

Add AltiVec implementation of quant_2x2_dc,
fix Altivec implementation of quant_(4x4|8x8)(|_dc) wrt current C implementation
Patch by Noboru Asai % noboru DD asai AA gmail DD com %

git-svn-id: svn://svn.videolan.org/x264/trunk@683 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index ccce8ef9..aa1990bd 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -37,23 +37,23 @@ typedef union {
 #include "quant.h"
 
 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 ) \
-temp1v = vec_ld((dct0), *dct); \
-temp2v = vec_ld((dct1), *dct); \
-mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf)); \
-mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf)); \
+#define QUANT_16_U( idx0, idx1 ) \
+temp1v = vec_ld((idx0), *dct); \
+temp2v = vec_ld((idx1), *dct); \
+mfvA = vec_ld((idx0), mf); \
+mfvB = vec_ld((idx1), mf); \
+biasvA = vec_ld((idx0), bias); \
+biasvB = vec_ld((idx1), bias); \
 mskA = vec_cmplt(temp1v, zerov); \
 mskB = vec_cmplt(temp2v, zerov); \
 coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
 coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
+coefvA = vec_adds(coefvA, biasvA); \
+coefvB = vec_adds(coefvB, biasvB); \
 multEvenvA = vec_mule(coefvA, mfvA); \
 multOddvA = vec_mulo(coefvA, mfvA); \
 multEvenvB = vec_mule(coefvB, mfvB); \
 multOddvB = vec_mulo(coefvB, mfvB); \
-multEvenvA = vec_adds(multEvenvA, fV); \
-multOddvA = vec_adds(multOddvA, fV); \
-multEvenvB = vec_adds(multEvenvB, fV); \
-multOddvB = vec_adds(multOddvB, fV); \
 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
 multOddvA = vec_sr(multOddvA, i_qbitsv); \
 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
@@ -62,58 +62,53 @@ temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(mul
 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
 temp1v = vec_xor(temp1v, mskA); \
 temp2v = vec_xor(temp2v, mskB); \
-temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
-vec_st(temp1v, (dct0), (int16_t*)dct); \
-temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
-vec_st(temp2v, (dct1), (int16_t*)dct);
+temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
+vec_st(temp1v, (idx0), (int16_t*)dct); \
+temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
+vec_st(temp2v, (idx1), (int16_t*)dct);
 
-void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
+void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
+    vec_u16_t biasvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
 
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
     vec_u16_t mfvB;
+    vec_u16_t biasvB;
 
     vec_s16_t temp1v, temp2v;
 
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_int_u f_u;
-    f_u.s[0]=f;
-
-    fV = vec_splat(f_u.v, 0);
-
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
 
-    QUANT_16_U( 0, 16, 0, 16, 32, 48 );
+    QUANT_16_U( 0, 16 );
 }
 
 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
-#define QUANT_16_U_DC( dct0, dct1 ) \
-temp1v = vec_ld((dct0), *dct); \
-temp2v = vec_ld((dct1), *dct); \
+#define QUANT_16_U_DC( idx0, idx1 ) \
+temp1v = vec_ld((idx0), *dct); \
+temp2v = vec_ld((idx1), *dct); \
 mskA = vec_cmplt(temp1v, zerov); \
 mskB = vec_cmplt(temp2v, zerov); \
 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
 coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
+coefvA = vec_add(coefvA, biasv); \
+coefvB = vec_add(coefvB, biasv); \
 multEvenvA = vec_mule(coefvA, mfv); \
 multOddvA = vec_mulo(coefvA, mfv); \
 multEvenvB = vec_mule(coefvB, mfv); \
 multOddvB = vec_mulo(coefvB, mfv); \
-multEvenvA = vec_add(multEvenvA, fV); \
-multOddvA = vec_add(multOddvA, fV); \
-multEvenvB = vec_add(multEvenvB, fV); \
-multOddvB = vec_add(multOddvB, fV); \
 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
 multOddvA = vec_sr(multOddvA, i_qbitsv); \
 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
@@ -123,18 +118,17 @@ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(mul
 temp1v = vec_xor(temp1v, mskA); \
 temp2v = vec_xor(temp2v, mskB); \
 temp1v = vec_add(temp1v, vec_and(mskA, one)); \
-vec_st(temp1v, (dct0), (int16_t*)dct); \
+vec_st(temp1v, (idx0), (int16_t*)dct); \
 temp2v = vec_add(temp2v, vec_and(mskB, one)); \
-vec_st(temp2v, (dct1), (int16_t*)dct);
+vec_st(temp2v, (idx1), (int16_t*)dct);
 
-
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
 
     vector bool short mskB;
     vec_u16_t coefvB;
@@ -143,17 +137,19 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_q
     vec_s16_t temp1v, temp2v;
 
     vec_u16_t mfv;
+    vec_u16_t biasv;
+
     vect_ushort_u mf_u;
-    mf_u.s[0]=i_quant_mf;
+    mf_u.s[0]=mf;
     mfv = vec_splat( mf_u.v, 0 );
 
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_int_u f_u;
-    f_u.s[0]=f;
-    fV = vec_splat(f_u.v, 0);
+    vect_ushort_u bias_u;
+    bias_u.s[0]=bias;
+    biasv = vec_splat(bias_u.v, 0);
 
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
@@ -161,38 +157,83 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_q
     QUANT_16_U_DC( 0, 16 );
 }
 
+// DC quant of a whole 2x2 block
+#define QUANT_4_U_DC( idx0 ) \
+const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
+temp1v = vec_ld((idx0), *dct); \
+mskA = vec_cmplt(temp1v, zerov); \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
+coefvA = vec_add(coefvA, biasv); \
+multEvenvA = vec_mule(coefvA, mfv); \
+multOddvA = vec_mulo(coefvA, mfv); \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
+multOddvA = vec_sr(multOddvA, i_qbitsv); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = vec_xor(temp2v, mskA); \
+temp2v = vec_add(temp2v, vec_and(mskA, one)); \
+temp1v = vec_sel(temp1v, temp2v, sel); \
+vec_st(temp1v, (idx0), (int16_t*)dct);
+
+void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
+{
+    vector bool short mskA;
+    vec_u32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+
+    vec_s16_t temp1v, temp2v;
+
+    vec_u16_t mfv;
+    vec_u16_t biasv;
+
+    vect_ushort_u mf_u;
+    mf_u.s[0]=mf;
+    mfv = vec_splat( mf_u.v, 0 );
+
+    vect_int_u qbits_u;
+    qbits_u.s[0]=16;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    vect_ushort_u bias_u;
+    bias_u.s[0]=bias;
+    biasv = vec_splat(bias_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+
+    QUANT_4_U_DC(0);
+}
 
-void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
+void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+{
     vector bool short mskA;
     vec_u32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
     vec_u16_t mfvA;
+    vec_u16_t biasvA;
     vec_s16_t zerov, one;
-    vec_u32_t fV;
 
     vector bool short mskB;
    vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
     vec_u16_t mfvB;
+    vec_u16_t biasvB;
 
     vec_s16_t temp1v, temp2v;
 
     vect_int_u qbits_u;
-    qbits_u.s[0]=i_qbits;
+    qbits_u.s[0]=16;
     i_qbitsv = vec_splat(qbits_u.v, 0);
 
-    vect_int_u f_u;
-    f_u.s[0]=f;
-    fV = vec_splat(f_u.v, 0);
-
     zerov = vec_splat_s16(0);
     one = vec_splat_s16(1);
 
     int i;
 
     for ( i=0; i<4; i++ ) {
-        QUANT_16_U( i*2*16, i*2*16+16, i*4*16, i*4*16+16, i*4*16+32, i*4*16+48 );
+        QUANT_16_U( i*2*16, i*2*16+16 );
     }
 }
 
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index a113c541..84d39436 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -21,8 +21,9 @@
 #ifndef _PPC_QUANT_H
 #define _PPC_QUANT_H 1
 
-void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
-void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
+void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
 
-void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
+void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
 #endif
diff --git a/common/quant.c b/common/quant.c
index 1e990cb5..48663e67 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -238,4 +238,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_8x8 = x264_quant_8x8_ssse3;
     }
 #endif
+
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC ) {
+        pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
+        pf->quant_4x4 = x264_quant_4x4_altivec;
+        pf->quant_8x8 = x264_quant_8x8_altivec;
+    }
+#endif
 }
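
For reference, all four routines above now compute the same per-coefficient operation as the scalar code they are being synced with: take |coef|, add the bias (vec_adds / vec_add), multiply by the quantization factor (vec_mule / vec_mulo), shift right by 16 (hence the hard-coded qbits_u.s[0]=16), and restore the sign, where vec_xor with the comparison mask followed by vec_add(vec_and(mask, one)) is the usual (x ^ m) + (m & 1) two's-complement negation. Below is a minimal scalar sketch of that operation, assuming the (bias + |coef|) * mf >> 16 form of the current C implementation; quant_4x4_ref is an illustrative name, not an x264 symbol, and the saturation performed by vec_adds and vec_packs is omitted.

#include <stdint.h>

/* Illustrative sketch only -- a scalar model of what QUANT_16_U does to
 * each coefficient after this patch: magnitude, plus bias, times mf,
 * shifted right by 16, with the original sign restored. */
static void quant_4x4_ref( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    int i;
    for( i = 0; i < 16; i++ )
    {
        if( dct[i] < 0 )  /* mirrors mskA = vec_cmplt(temp1v, zerov) */
            dct[i] = -(int16_t)( ( (uint32_t)( bias[i] - dct[i] ) * mf[i] ) >> 16 );
        else
            dct[i] =  (int16_t)( ( (uint32_t)( bias[i] + dct[i] ) * mf[i] ) >> 16 );
    }
}

The 2x2 DC variant loads a full 8-halfword vector but only owns the first four coefficients, which is why QUANT_4_U_DC blends the quantized result back into the original data with vec_sel and the CV(-1,-1,-1,-1,0,0,0,0) mask before storing.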