From: Fiona Glaser Date: Sun, 6 Jul 2008 18:59:15 +0000 (-0600) Subject: Various optimizations and cosmetics X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c9c7edf3e6fa8fbdd4d7bf2beccb448bdcac9aa4;p=libx264 Various optimizations and cosmetics. Update AUTHORS file with Gabriel and me. Update XCHG macro to work correctly in if statements. Add new lookup tables for block_idx and fdec/fenc addresses. Slightly faster array_non_zero_count_mmx (patch by holger). Eliminate branch in analyse_intra. Unroll loops in, and clean up, chroma encode. Convert some for loops to do/while loops for speed improvement. Do explicit write-combining on --me tesa mvsad_t struct. Shrink --me esa zero[] array. Speed up bime by reducing size of visited[][][] array. --- diff --git a/AUTHORS b/AUTHORS index 4e7a6003..129d0115 100644 --- a/AUTHORS +++ b/AUTHORS @@ -39,11 +39,21 @@ S: France N: Francesco Corriga D: VfW +N: Gabriel Bouvigne +E: gabriel.bouvigne AT joost DOT com +D: 2pass VBV + N: Guillaume Poirier E: gpoirier CHEZ mplayerhq POINT hu D: Altivec optimizations S: Brittany, France +N: Fiona Glaser +E: fiona AT x264 DOT com +D: x86 asm, 1pass VBV, adaptive quantization, inline asm +D: various speed optimizations, bugfixes +S: USA + N: Justin Clay E: justin_clay AT hotmail DOT com C: wheatgerm diff --git a/common/common.h b/common/common.h index 376ec1e9..4095b8d7 100644 --- a/common/common.h +++ b/common/common.h @@ -33,7 +33,7 @@ #define X264_MAX3(a,b,c) X264_MAX((a),X264_MAX((b),(c))) #define X264_MIN4(a,b,c,d) X264_MIN((a),X264_MIN3((b),(c),(d))) #define X264_MAX4(a,b,c,d) X264_MAX((a),X264_MAX3((b),(c),(d))) -#define XCHG(type,a,b) { type t = a; a = b; b = t; } +#define XCHG(type,a,b) do{ type t = a; a = b; b = t; } while(0) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define CHECKED_MALLOC( var, size )\ diff --git a/common/macroblock.h b/common/macroblock.h index 14741fc2..9d9b2223 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -219,6 +219,32 @@ static const uint8_t 
block_idx_xy[4][4] = { 4, 6, 12, 14 }, { 5, 7, 13, 15 } }; +static const uint8_t block_idx_xy_1d[16] = +{ + 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 +}; +static const uint8_t block_idx_xy_fenc[16] = +{ + 0*4 + 0*4*FENC_STRIDE, 1*4 + 0*4*FENC_STRIDE, + 0*4 + 1*4*FENC_STRIDE, 1*4 + 1*4*FENC_STRIDE, + 2*4 + 0*4*FENC_STRIDE, 3*4 + 0*4*FENC_STRIDE, + 2*4 + 1*4*FENC_STRIDE, 3*4 + 1*4*FENC_STRIDE, + 0*4 + 2*4*FENC_STRIDE, 1*4 + 2*4*FENC_STRIDE, + 0*4 + 3*4*FENC_STRIDE, 1*4 + 3*4*FENC_STRIDE, + 2*4 + 2*4*FENC_STRIDE, 3*4 + 2*4*FENC_STRIDE, + 2*4 + 3*4*FENC_STRIDE, 3*4 + 3*4*FENC_STRIDE +}; +static const uint16_t block_idx_xy_fdec[16] = +{ + 0*4 + 0*4*FDEC_STRIDE, 1*4 + 0*4*FDEC_STRIDE, + 0*4 + 1*4*FDEC_STRIDE, 1*4 + 1*4*FDEC_STRIDE, + 2*4 + 0*4*FDEC_STRIDE, 3*4 + 0*4*FDEC_STRIDE, + 2*4 + 1*4*FDEC_STRIDE, 3*4 + 1*4*FDEC_STRIDE, + 0*4 + 2*4*FDEC_STRIDE, 1*4 + 2*4*FDEC_STRIDE, + 0*4 + 3*4*FDEC_STRIDE, 1*4 + 3*4*FDEC_STRIDE, + 2*4 + 2*4*FDEC_STRIDE, 3*4 + 2*4*FDEC_STRIDE, + 2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE +}; static const uint8_t i_chroma_qp_table[52] = { diff --git a/common/x86/util.h b/common/x86/util.h index 2158c373..7d638c27 100644 --- a/common/x86/util.h +++ b/common/x86/util.h @@ -77,24 +77,22 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t #define array_non_zero_count array_non_zero_count_mmx static inline int array_non_zero_count_mmx( int16_t *v ) { - static const uint64_t pw_2 = 0x0202020202020202ULL; int count; asm( "pxor %%mm7, %%mm7 \n" "movq (%1), %%mm0 \n" - "movq 16(%1), %%mm1 \n" - "packsswb 8(%1), %%mm0 \n" + "movq 8(%1), %%mm1 \n" + "packsswb 16(%1), %%mm0 \n" "packsswb 24(%1), %%mm1 \n" "pcmpeqb %%mm7, %%mm0 \n" "pcmpeqb %%mm7, %%mm1 \n" "paddb %%mm0, %%mm1 \n" - "paddb %2, %%mm1 \n" "psadbw %%mm7, %%mm1 \n" "movd %%mm1, %0 \n" :"=r"(count) - :"r"(v), "m"(pw_2) + :"r"(v) ); - return count; + return (count+0x10)&0xff; } #undef array_non_zero_int #define array_non_zero_int array_non_zero_int_mmx diff 
--git a/encoder/analyse.c b/encoder/analyse.c index 7afa8531..d22412c9 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -606,10 +606,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( b_merged_satd && i_max == 9 ) { - int satd[3]; + int satd[9]; h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd ); - if( i_pred_mode < 3 ) - satd[i_pred_mode] -= 3 * a->i_lambda; + satd[i_pred_mode] -= 3 * a->i_lambda; for( i=2; i>=0; i-- ) { int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda; @@ -679,10 +678,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ for( idx = 0;; idx++ ) { - int x = block_idx_x[idx]; - int y = block_idx_y[idx]; - uint8_t *p_src_by = p_src + 4*x + 4*y*FENC_STRIDE; - uint8_t *p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE; + uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx]; + uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx]; int i_best = COST_MAX; int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); @@ -694,10 +691,9 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( b_merged_satd && i_max >= 6 ) { - int satd[3]; + int satd[9]; h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd ); - if( i_pred_mode < 3 ) - satd[i_pred_mode] -= 3 * a->i_lambda; + satd[i_pred_mode] -= 3 * a->i_lambda; for( i=2; i>=0; i-- ) COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda, a->i_predict4x4[idx], i ); @@ -808,16 +804,11 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) int i_nnz = 0; for( idx = 0; idx < 16; idx++ ) { - uint8_t *p_src_by; - uint8_t *p_dst_by; + uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx]; i_best = COST_MAX; i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); - x = block_idx_x[idx]; - y = block_idx_y[idx]; - p_src_by = p_src + 4*x + 4*y*FENC_STRIDE; - p_dst_by = p_dst + 4*x + 4*y*FDEC_STRIDE; predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max ); if( (h->mb.i_neighbour4[idx] & 
(MB_TOPRIGHT|MB_TOP)) == MB_TOP ) diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 3eb29ecc..96c0db53 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -19,13 +19,12 @@ * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. *****************************************************************************/ #include "common/common.h" #include "macroblock.h" - #define ZIG(i,y,x) level[i] = dct[x][y]; static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] ) { @@ -82,10 +81,8 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max ) void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale ) { - int x = 4 * block_idx_x[idx]; - int y = 4 * block_idx_y[idx]; - uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE]; - uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE]; + uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]]; + uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]]; DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); if( h->mb.b_lossless ) @@ -147,10 +144,10 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) { for( i = 0; i < 16; i++ ) { - int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE; - int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; + int oe = block_idx_xy_fenc[i]; + int od = block_idx_xy_fdec[i]; h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od ); - dct_dc4x4[block_idx_x[i]][block_idx_y[i]] = h->dct.luma4x4[i][0]; + dct_dc4x4[0][block_idx_xy_1d[i]] = h->dct.luma4x4[i][0]; h->dct.luma4x4[i][0] = 0; } h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); @@ -161,7 +158,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) for( i = 0; i < 16; i++ ) { /* copy dc coeff */ - dct_dc4x4[block_idx_y[i]][block_idx_x[i]] = 
dct4x4[i][0][0]; + dct_dc4x4[0][block_idx_xy_1d[i]] = dct4x4[i][0][0]; dct4x4[i][0][0] = 0; /* quant/scan/dequant */ @@ -186,7 +183,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) for( i = 0; i < 16; i++ ) { /* copy dc coeff */ - dct4x4[i][0][0] = dct_dc4x4[block_idx_y[i]][block_idx_x[i]]; + dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]]; } /* put pixels to fdec */ h->dctf.add16x16_idct( p_dst, dct4x4 ); @@ -224,7 +221,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) for( i = 0; i < 4; i++ ) { /* copy dc coeff */ - dct2x2[block_idx_y[i]][block_idx_x[i]] = dct4x4[i][0][0]; + dct2x2[i>>1][i&1] = dct4x4[i][0][0]; dct4x4[i][0][0] = 0; /* no trellis; it doesn't seem to help chroma noticeably */ @@ -258,9 +255,10 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) for( i = 0; i < 4; i++ ) h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qscale ); } - - for( i = 0; i < 4; i++ ) - dct4x4[i][0][0] = dct2x2[0][i]; + dct4x4[0][0][0] = dct2x2[0][0]; + dct4x4[1][0][0] = dct2x2[0][1]; + dct4x4[2][0][0] = dct2x2[1][0]; + dct4x4[3][0][0] = dct2x2[1][1]; h->dctf.add8x8_idct( p_dst, dct4x4 ); } @@ -408,7 +406,7 @@ void x264_macroblock_encode( x264_t *h ) } for( i = h->mb.i_skip_intra ? 
15 : 0 ; i < 16; i++ ) { - uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * FDEC_STRIDE]; + uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]]; int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP ) @@ -432,11 +430,9 @@ void x264_macroblock_encode( x264_t *h ) { for( i4x4 = 0; i4x4 < 16; i4x4++ ) { - int x = 4*block_idx_x[i4x4]; - int y = 4*block_idx_y[i4x4]; h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4], - h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE, - h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE ); + h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4], + h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] ); } } else if( h->mb.b_transform_8x8 ) diff --git a/encoder/me.c b/encoder/me.c index 5d1a2e6b..d4f3eaa6 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -196,8 +196,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 ); COST_MV_HPEL( mx, my ); } - i++; - } while( i < i_mvc ); + } while( ++i < i_mvc ); bmx = ( bpred_mx + 2 ) >> 2; bmy = ( bpred_my + 2 ) >> 2; COST_MV( bmx, bmy ); @@ -223,8 +222,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, my = x264_clip3( my, mv_y_min, mv_y_max ); COST_MV( mx, my ); } - i++; - } while( i < i_mvc ); + } while( ++i < i_mvc ); } COST_MV( 0, 0 ); @@ -232,14 +230,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, { case X264_ME_DIA: /* diamond search, radius 1 */ - for( i = 0; i < i_me_range; i++ ) + i = 0; + do { DIA1_ITER( bmx, bmy ); if( (bmx == omx) & (bmy == omy) ) break; if( !CHECK_MVRANGE(bmx, bmy) ) break; - } + } while( ++i < i_me_range ); break; case X264_ME_HEX: @@ -410,7 +409,9 @@ me_hex2: /* hexagon grid */ omx = bmx; omy = bmy; - for( i = 1; i <= i_me_range/4; i++ ) + + i = 1; + do { static const int hex4[16][2] = { {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2}, @@ -437,7 +438,7 @@ me_hex2: 
COST_MV_X4( 4*i, 1*i, 4*i, 2*i, 2*i, 3*i, 0*i, 4*i ); COST_MV_X4( -2*i, 3*i, -2*i,-3*i, 0*i,-4*i, 2*i,-3*i ); } - } + } while( ++i <= i_me_range/4 ); if( bmy <= mv_y_max ) goto me_hex2; break; @@ -464,7 +465,10 @@ me_hex2: * because sum(abs(diff)) >= abs(diff(sum)). */ const int stride = m->i_stride[0]; uint16_t *sums_base = m->integral; - DECLARE_ALIGNED_16( static uint8_t zero[16*16] ); + /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned. + * unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any + * SSE instructions and the only loss is a tiny bit of performance. */ + DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] ); DECLARE_ALIGNED_16( int enc_dc[4] ); int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4; int delta = x264_pixel_size[sad_size].w; @@ -546,7 +550,13 @@ me_hex2: for( i=0; i limit ) @@ -558,7 +568,12 @@ me_hex2: for( j=i+1; j i ) - XCHG( mvsad_t, mvsads[i], mvsads[bj] ); + { + if( sizeof( mvsad_t ) == sizeof( uint64_t ) ) + XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] ); + else + XCHG( mvsad_t, mvsads[i], mvsads[bj] ); + } } nmvsad = limit; } @@ -781,12 +796,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite BIME_CACHE(-(a),-(b)) #define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \ -if( pass == 0 || !visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] ) \ +if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \ { \ int cost; \ int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \ int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \ - visited[(m0x)&7][(m0y)&7][(m1x)&7][(m1y)&7] = 1; \ + visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\ h->mc.memcpy_aligned( pix, pix0[i0], bs ); \ if( i_weight == 32 ) \ h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \ @@ -837,7 +852,8 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight int bm1y = m1->mv[1], om1y = bm1y; int bcost = COST_MAX; int pass = 0; - 
uint8_t visited[8][8][8][8]; + /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ + uint8_t visited[8][8][8]; h->mc.memzero_aligned( visited, sizeof(visited) ); BIME_CACHE( 0, 0 ); @@ -898,8 +914,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight if( satd <= bsatd * SATD_THRESH )\ { \ int cost; \ - cache_mv[0] = cache_mv2[0] = mx; \ - cache_mv[1] = cache_mv2[1] = my; \ + *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \ cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ @@ -937,7 +952,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 ) p_cost_mvx = m->p_cost_mv - pmx; p_cost_mvy = m->p_cost_mv - pmy; COST_MV_SATD( bmx, bmy, bsatd ); - COST_MV_RD( bmx, bmy, 0, 0, 0); + COST_MV_RD( bmx, bmy, 0, 0, 0 ); /* check the predicted mv */ if( (bmx != pmx || bmy != pmy)