From dba0e5a2e089cd675e201cdf4e3358eb7a0e22cc Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sun, 15 Jun 2008 11:50:17 -0600 Subject: [PATCH] Cosmetics and loop unrolling GCC is not very good at loop unrolling in cases where it can perform constant propagation, so the unrolling unfortunately has to be done manually. --- common/cabac.h | 2 +- common/frame.c | 225 +++++++++++++++++++++----------------------- common/macroblock.c | 33 ++++--- 3 files changed, 128 insertions(+), 132 deletions(-) diff --git a/common/cabac.h b/common/cabac.h index bfdc5b3f..8289d2b0 100644 --- a/common/cabac.h +++ b/common/cabac.h @@ -38,7 +38,7 @@ typedef struct uint8_t *p; uint8_t *p_end; - /* aligned for aligned_memcpy starting here */ + /* aligned for memcpy_aligned starting here */ DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision() /* context */ diff --git a/common/frame.c b/common/frame.c index 99565271..214d4fc7 100644 --- a/common/frame.c +++ b/common/frame.c @@ -594,7 +594,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x; const int b_8x8_transform = h->mb.mb_transform_size[mb_xy]; const int i_edge_end = (h->mb.type[mb_xy] == P_SKIP) ? 1 : 4; - int i_edge, i_dir; + int i_edge; int i_pix_y[3] = { 16*mb_y*h->fdec->i_stride[0] + 16*mb_x, 8*mb_y*h->fdec->i_stride[1] + 8*mb_x, @@ -610,125 +610,116 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) /* i_dir == 0 -> vertical edge * i_dir == 1 -> horizontal edge */ - for( i_dir = 0; i_dir < 2; i_dir++ ) - { - int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0)); - int i_qp, i_qpn; - - for( i_edge = i_start; i_edge < i_edge_end; i_edge++ ) - { - int mbn_xy, mbn_8x8, mbn_4x4; - int bS[4]; /* filtering strength */ - - if( b_8x8_transform && (i_edge&1) ) - continue; - - mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride ); - mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? 
mb_8x8 - 2 : mb_8x8 - 2 * s8x8 ); - mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 ); - - if( b_interlaced && i_edge == 0 && i_dir == 1 ) - { - mbn_xy -= h->mb.i_mb_stride; - mbn_8x8 -= 2 * s8x8; - mbn_4x4 -= 4 * s4x4; - } - - /* *** Get bS for each 4px for the current edge *** */ - if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) ) - { - bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 ); - } - else - { - int i; - for( i = 0; i < 4; i++ ) - { - int x = i_dir == 0 ? i_edge : i; - int y = i_dir == 0 ? i : i_edge; - int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03; - int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03; - - if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 || - h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 ) - { - bS[i] = 2; - } - else - { - /* FIXME: A given frame may occupy more than one position in - * the reference list. So we should compare the frame numbers, - * not the indices in the ref list. 
- * No harm yet, as we don't generate that case.*/ - - int i8p= mb_8x8+(x/2)+(y/2)*s8x8; - int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8; - int i4p= mb_4x4+x+y*s4x4; - int i4q= mbn_4x4+xn+yn*s4x4; - int l; - - bS[i] = 0; - - for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ ) - { - if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] || - abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 || - abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit ) - { - bS[i] = 1; - break; - } - } - } - } - } - /* *** filter *** */ - /* Y plane */ - i_qp = h->mb.qp[mb_xy]; - i_qpn= h->mb.qp[mbn_xy]; - - if( i_dir == 0 ) - { - /* vertical edge */ - deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge], - i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0, - h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra ); - if( !(i_edge & 1) ) - { - /* U/V planes */ - int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + - i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge], - i_stride2[1], bS, i_qpc, 1, - h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); - deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge], - i_stride2[2], bS, i_qpc, 1, - h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra ); - } - } - else - { - /* horizontal edge */ - deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]], - i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0, - h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra ); - /* U/V planes */ - if( !(i_edge & 1) ) - { - int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + - i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; - deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge*i_stride2[1]], - i_stride2[1], bS, i_qpc, 1, - h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); - deblock_edge( h, 
&h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]], - i_stride2[2], bS, i_qpc, 1, - h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra ); - } - } - } + #define deblock_dir(i_dir)\ + {\ + int i_start = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\ + int i_qp, i_qpn;\ + for( i_edge = i_start; i_edge < i_edge_end; i_edge++ )\ + {\ + int mbn_xy, mbn_8x8, mbn_4x4;\ + int bS[4]; /* filtering strength */\ + if( b_8x8_transform && (i_edge&1) )\ + continue;\ + mbn_xy = i_edge > 0 ? mb_xy : ( i_dir == 0 ? mb_xy - 1 : mb_xy - h->mb.i_mb_stride );\ + mbn_8x8 = i_edge > 0 ? mb_8x8 : ( i_dir == 0 ? mb_8x8 - 2 : mb_8x8 - 2 * s8x8 );\ + mbn_4x4 = i_edge > 0 ? mb_4x4 : ( i_dir == 0 ? mb_4x4 - 4 : mb_4x4 - 4 * s4x4 );\ + if( b_interlaced && i_edge == 0 && i_dir == 1 )\ + {\ + mbn_xy -= h->mb.i_mb_stride;\ + mbn_8x8 -= 2 * s8x8;\ + mbn_4x4 -= 4 * s4x4;\ + }\ + /* *** Get bS for each 4px for the current edge *** */\ + if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy] ) )\ + bS[0] = bS[1] = bS[2] = bS[3] = ( i_edge == 0 && !(b_interlaced && i_dir) ? 4 : 3 );\ + else\ + {\ + int i;\ + for( i = 0; i < 4; i++ )\ + {\ + int x = i_dir == 0 ? i_edge : i;\ + int y = i_dir == 0 ? i : i_edge;\ + int xn = (x - (i_dir == 0 ? 1 : 0 ))&0x03;\ + int yn = (y - (i_dir == 0 ? 0 : 1 ))&0x03;\ + if( h->mb.non_zero_count[mb_xy][block_idx_xy[x][y]] != 0 ||\ + h->mb.non_zero_count[mbn_xy][block_idx_xy[xn][yn]] != 0 )\ + {\ + bS[i] = 2;\ + }\ + else\ + {\ + /* FIXME: A given frame may occupy more than one position in\ + * the reference list. 
So we should compare the frame numbers,\ + * not the indices in the ref list.\ + * No harm yet, as we don't generate that case.*/\ + int i8p= mb_8x8+(x/2)+(y/2)*s8x8;\ + int i8q= mbn_8x8+(xn/2)+(yn/2)*s8x8;\ + int i4p= mb_4x4+x+y*s4x4;\ + int i4q= mbn_4x4+xn+yn*s4x4;\ + int l;\ + bS[i] = 0;\ + for( l = 0; l < 1 + (h->sh.i_type == SLICE_TYPE_B); l++ )\ + {\ + if( h->mb.ref[l][i8p] != h->mb.ref[l][i8q] ||\ + abs( h->mb.mv[l][i4p][0] - h->mb.mv[l][i4q][0] ) >= 4 ||\ + abs( h->mb.mv[l][i4p][1] - h->mb.mv[l][i4q][1] ) >= mvy_limit )\ + {\ + bS[i] = 1;\ + break;\ + }\ + }\ + }\ + }\ + }\ + /* *** filter *** */\ + /* Y plane */\ + i_qp = h->mb.qp[mb_xy];\ + i_qpn= h->mb.qp[mbn_xy];\ + if( i_dir == 0 )\ + {\ + /* vertical edge */\ + deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge],\ + i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\ + h->loopf.deblock_h_luma, h->loopf.deblock_h_luma_intra );\ + if( !(i_edge & 1) )\ + {\ + /* U/V planes */\ + int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\ + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\ + deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 2*i_edge],\ + i_stride2[1], bS, i_qpc, 1,\ + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\ + deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge],\ + i_stride2[2], bS, i_qpc, 1,\ + h->loopf.deblock_h_chroma, h->loopf.deblock_h_chroma_intra );\ + }\ + }\ + else\ + {\ + /* horizontal edge */\ + deblock_edge( h, &h->fdec->plane[0][i_pix_y[0] + 4*i_edge*i_stride2[0]],\ + i_stride2[0], bS, (i_qp+i_qpn+1) >> 1, 0,\ + h->loopf.deblock_v_luma, h->loopf.deblock_v_luma_intra );\ + /* U/V planes */\ + if( !(i_edge & 1) )\ + {\ + int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] +\ + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1;\ + deblock_edge( h, &h->fdec->plane[1][i_pix_y[1] + 
2*i_edge*i_stride2[1]],\ + i_stride2[1], bS, i_qpc, 1,\ + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\ + deblock_edge( h, &h->fdec->plane[2][i_pix_y[2] + 2*i_edge*i_stride2[2]],\ + i_stride2[2], bS, i_qpc, 1,\ + h->loopf.deblock_v_chroma, h->loopf.deblock_v_chroma_intra );\ + }\ + }\ + }\ } + deblock_dir(0); + deblock_dir(1); + /* next mb */ if( !b_interlaced || (mb_y&1) ) mb_x++; diff --git a/common/macroblock.c b/common/macroblock.c index 182b5e0d..df912ee3 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -467,16 +467,16 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx ) { const int x = 2*(idx%2); const int y = 2*(idx/2); - int l; x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] ); x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] ); - for( l = 0; l < 2; l++ ) - { - *(uint64_t*)h->mb.cache.mv[l][x264_scan8[idx*4]] = - *(uint64_t*)h->mb.cache.direct_mv[l][x264_scan8[idx*4]]; - *(uint64_t*)h->mb.cache.mv[l][x264_scan8[idx*4]+8] = - *(uint64_t*)h->mb.cache.direct_mv[l][x264_scan8[idx*4]+8]; - } + *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] = + *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]]; + *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] = + *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8]; + *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] = + *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]]; + *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] = + *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8]; } #define FIXED_SCALE 256 @@ -979,7 +979,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb int ref_pix_offset[2] = { i_pix_offset, i_pix_offset }; const uint8_t *intra_fdec = &h->mb.intra_border_backup[i_mb_y & h->sh.b_mbaff][i][i_mb_x*16>>!!i]; x264_frame_t **fref[2] = { h->fref0, h->fref1 }; - int j, k, l; + int j, k; if( h->mb.b_interlaced ) ref_pix_offset[1] += (1-2*(i_mb_y&1)) * i_stride; 
h->mb.pic.i_stride[i] = i_stride2; @@ -992,16 +992,21 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int i_mb for( j = 0; j < w; j++ ) h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2]; } - for( l=0; l<2; l++ ) + for( j = 0; j < h->mb.pic.i_fref[0]; j++ ) { - for( j=0; j<h->mb.pic.i_fref[l]; j++ ) + h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + if( i == 0 ) + for( k = 1; k < 4; k++ ) + h->mb.pic.p_fref[0][j][k] = &fref[0][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; + } + if( h->sh.i_type == SLICE_TYPE_B ) + for( j = 0; j < h->mb.pic.i_fref[1]; j++ ) { - h->mb.pic.p_fref[l][j][i==0 ? 0:i+3] = &fref[l][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; + h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> h->mb.b_interlaced]->plane[i][ref_pix_offset[j&1]]; if( i == 0 ) for( k = 1; k < 4; k++ ) - h->mb.pic.p_fref[l][j][k] = &fref[l][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; + h->mb.pic.p_fref[1][j][k] = &fref[1][j >> h->mb.b_interlaced]->filtered[k][ref_pix_offset[j&1]]; } - } } void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) -- 2.40.0