From 32bd2d645c63c7cf55a2f9b33e39e63144c3e835 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Thu, 15 May 2008 05:41:43 -0600 Subject: [PATCH] force unroll macroblock_load_pic_pointers and a few other minor optimizations --- common/macroblock.c | 78 ++++++++++++++++++++++---------------------- encoder/macroblock.c | 20 ++++-------- 2 files changed, 45 insertions(+), 53 deletions(-) diff --git a/common/macroblock.c b/common/macroblock.c index a68c8152..cd1f9cc7 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -1011,6 +1011,42 @@ static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src ) dst[i*FDEC_STRIDE] = src[i*FDEC_STRIDE]; } +static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb_x, int i_mb_y, int i) +{ + const int w = (i == 0 ? 16 : 8); + const int i_stride = h->fdec->i_stride[i]; + const int i_stride2 = i_stride << h->mb.b_interlaced; + const int i_pix_offset = h->mb.b_interlaced + ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride + : w * (i_mb_x + i_mb_y * i_stride); + int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; + const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i]; + x264_frame_t **fref[2] = { h->fref0, h->fref1 }; + int j, k, l; + if( h->mb.b_interlaced ) + ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride; + h->mb.pic.i_stride[i] = i_stride2; + h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, + &h->fenc->plane[i][i_pix_offset], i_stride2, w ); + memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); + if( h->mb.b_interlaced ) + { + const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; + for( j = 0; j < w; j++ ) + h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; + } + for( l=0; l<2; l++ ) + { + for( j=0; j<h->mb.pic.i_fref[l]; j++ ) + { + h->mb.pic.p_fref[l][j][i==0 ? 
0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( i == 0 ) + for( k = 1; k < 4; k++ ) + h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; + } + } +} + void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) { int i_mb_xy = i_mb_y * h->mb.i_mb_stride + i_mb_x; @@ -1189,45 +1225,9 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) } /* load picture pointers */ - for( i = 0; i < 3; i++ ) - { - const int w = (i == 0 ? 16 : 8); - const int i_stride = h->fdec->i_stride[i]; - const int i_stride2 = i_stride << h->mb.b_interlaced; - const int i_pix_offset = h->mb.b_interlaced - ? w * (i_mb_x + (i_mb_y&~1) * i_stride) + (i_mb_y&1) * i_stride - : w * (i_mb_x + i_mb_y * i_stride); - int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; - const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i]; - x264_frame_t **fref[2] = { h->fref0, h->fref1 }; - int j, k, l; - - if( h->mb.b_interlaced ) - ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride; - - h->mb.pic.i_stride[i] = i_stride2; - - h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE, - &h->fenc->plane[i][i_pix_offset], i_stride2, w ); - memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 ); - if( h->mb.b_interlaced ) - { - const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset]; - for( j = 0; j < w; j++ ) - h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; - } - - for( l=0; l<2; l++ ) - { - for( j=0; j<h->mb.pic.i_fref[l]; j++ ) - { - h->mb.pic.p_fref[l][j][i==0 ? 
0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; - if( i == 0 ) - for( k = 1; k < 4; k++ ) - h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; - } - } - } + x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 0 ); + x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 1 ); + x264_macroblock_load_pic_pointers( h, i_mb_x, i_mb_y, 2 ); if( h->fdec->integral ) { diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 01d09bc6..33547146 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -64,7 +64,7 @@ static int x264_mb_decimate_score( int16_t *dct, int i_max ) { int i_run; - if( abs( dct[idx--] ) > 1 ) + if( (unsigned)(dct[idx--] + 1) > 2 ) return 9; i_run = 0; @@ -273,15 +273,9 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) static void x264_macroblock_encode_skip( x264_t *h ) { - int i; h->mb.i_cbp_luma = 0x00; h->mb.i_cbp_chroma = 0x00; - - for( i = 0; i < 16+8; i++ ) - { - h->mb.cache.non_zero_count[x264_scan8[i]] = 0; - } - + memset( h->mb.cache.non_zero_count, 0, X264_SCAN8_SIZE ); /* store cbp */ h->mb.cbp[h->mb.i_mb_xy] = 0; } @@ -500,8 +494,8 @@ void x264_macroblock_encode( x264_t *h ) h->quantf.quant_4x4( dct4x4[idx], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); - - if( b_decimate ) + + if( b_decimate && i_decimate_8x8 <= 6 ) i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[idx], 16 ); } @@ -799,10 +793,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) int i4; DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - h->quantf.quant_4x4( dct4x4[0], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->quantf.quant_4x4( dct4x4[1], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - h->quantf.quant_4x4( dct4x4[2], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - 
h->quantf.quant_4x4( dct4x4[3], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + for( i4 = 0; i4 < 4; i4++ ) + h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); for( i4 = 0; i4 < 4; i4++ ) h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); -- 2.40.0