From: Fiona Glaser Date: Wed, 13 Jul 2011 00:27:18 +0000 (-0700) Subject: Optimize frame_deblock_row X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8ae69dbc7ec37e157a3890c21ec4904973e800f9;p=libx264 Optimize frame_deblock_row --- diff --git a/common/deblock.c b/common/deblock.c index 812f4ea9..a14ecc27 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -452,10 +452,10 @@ void deblock_strength_mbaff_c( uint8_t nnz_cache[X264_SCAN8_SIZE], int8_t ref[2] } } -static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) +static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) { - int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; - int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int index_a = i_qp + a; + int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); int8_t tc[4]; @@ -471,10 +471,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS pf_inter( pix, i_stride, alpha, beta, tc ); } -static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) +static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) { - int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset; - int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset; + int index_a = i_qp + a; + int index_b = i_qp + b; int alpha = alpha_table(index_a) << (BIT_DEPTH-8); int beta = beta_table(index_b) << (BIT_DEPTH-8); @@ -487,9 +487,13 @@ static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint void x264_frame_deblock_row( x264_t *h, int mb_y ) { int b_interlaced = SLICE_MBAFF; - int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->param.analyse.i_chroma_qp_offset ); + int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; + int b = h->sh.i_beta_offset - QP_BD_OFFSET; + int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); int stridey = h->fdec->i_stride[0]; int strideuv = h->fdec->i_stride[1]; + int chroma444 = CHROMA444; + intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1; for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced ) { @@ -502,38 +506,38 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x]; pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x; - pixel *pixuv = h->fdec->plane[1] + (8<fdec->plane[2] - h->fdec->plane[1] : 1; + pixel *pixuv = h->fdec->plane[1] + (8<mb.qp[mb_xy]; int qpc = h->chroma_qp_table[qp]; - int first_edge_only = h->mb.type[mb_xy] == P_SKIP || qp <= qp_thresh; + int first_edge_only = (h->mb.partition[mb_xy] == D_16x16 && !h->mb.cbp[mb_xy] && !intra_cur) || qp <= qp_thresh; #define FILTER( intra, dir, edge, qp, chroma_qp )\ do\ {\ deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\ - stride2y, bs[dir][edge], qp, 0,\ + stride2y, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ - if( CHROMA444 )\ + if( chroma444 )\ {\ deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\ - stride2uv, bs[dir][edge], chroma_qp, 0,\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\ - stride2uv, bs[dir][edge], chroma_qp, 0,\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\ h->loopf.deblock_luma##intra[dir] );\ }\ else if( !(edge & 1) )\ deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\ - stride2uv, bs[dir][edge], chroma_qp, 1,\ + stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\ h->loopf.deblock_chroma##intra[dir] );\ } while(0) @@ -544,53 +548,52 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) int luma_qp[2]; int chroma_qp[2]; int left_qp[2]; - int current_qp = h->mb.qp[mb_xy]; x264_deblock_inter_t luma_deblock = deblock_v_luma_mbaff_c; - x264_deblock_inter_t chroma_deblock = CHROMA444 ? deblock_v_luma_mbaff_c : deblock_v_chroma_mbaff_c; + x264_deblock_inter_t chroma_deblock = chroma444 ? deblock_v_luma_mbaff_c : deblock_v_chroma_mbaff_c; x264_deblock_intra_t luma_intra_deblock = deblock_v_luma_intra_mbaff_c; - x264_deblock_intra_t chroma_intra_deblock = CHROMA444 ? deblock_v_luma_intra_mbaff_c : deblock_v_chroma_intra_mbaff_c; - int c = CHROMA444 ? 0 : 1; + x264_deblock_intra_t chroma_intra_deblock = chroma444 ? deblock_v_luma_intra_mbaff_c : deblock_v_chroma_intra_mbaff_c; + int c = chroma444 ? 0 : 1; left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]]; - luma_qp[0] = (current_qp + left_qp[0] + 1) >> 1; - chroma_qp[0] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[0]] + 1) >> 1; - if( bs[0][0][0] == 4) + luma_qp[0] = (qp + left_qp[0] + 1) >> 1; + chroma_qp[0] = (qpc + h->chroma_qp_table[left_qp[0]] + 1) >> 1; + if( bs[0][0][0] == 4 ) { - deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, luma_intra_deblock ); - deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock ); - deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_intra_deblock ); + deblock_edge_intra( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_intra_deblock ); + deblock_edge_intra( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); + deblock_edge_intra( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_intra_deblock ); } else { - deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], 0, luma_deblock ); - deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock ); - deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], c, chroma_deblock ); + deblock_edge( h, pixy, 2*stridey, bs[0][0], luma_qp[0], a, b, 0, luma_deblock ); + deblock_edge( h, pixuv, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); + deblock_edge( h, pixuv + uvdiff, 2*strideuv, bs[0][0], chroma_qp[0], a, b, c, chroma_deblock ); } int offy = MB_INTERLACED ? 4 : 0; int offuv = MB_INTERLACED ? 3 : 0; - if( CHROMA444 ) offuv = offy; + if( chroma444 ) offuv = offy; left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]]; - luma_qp[1] = (current_qp + left_qp[1] + 1) >> 1; - chroma_qp[1] = (h->chroma_qp_table[current_qp] + h->chroma_qp_table[left_qp[1]] + 1) >> 1; + luma_qp[1] = (qp + left_qp[1] + 1) >> 1; + chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1; if( bs[0][4][0] == 4) { - deblock_edge_intra( h, pixy + (stridey<mb.qp[h->mb.i_mb_xy-1]; int qp_left = (qp + qpl + 1) >> 1; - int qpc_left = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpl] + 1) >> 1; + int qpc_left = (qpc + h->chroma_qp_table[qpl] + 1) >> 1; int intra_left = IS_INTRA( h->mb.type[h->mb.i_mb_xy-1] ); if( intra_cur || intra_left ) @@ -616,24 +619,24 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) { int qpt = h->mb.qp[mbn_xy]; int qp_top = (qp + qpt + 1) >> 1; - int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1; + int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; // deblock the first horizontal edge of the even rows, then the first horizontal edge of the odd rows - deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, 0, deblock_v_luma_c ); - if( CHROMA444 ) + deblock_edge( h, pixy + j*stridey, 2* stridey, bs[1][4*j], qp_top, a, b, 0, deblock_v_luma_c ); + if( chroma444 ) { - deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c ); - deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 0, deblock_v_luma_c ); + deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, deblock_v_luma_c ); + deblock_edge( h, pixuv + uvdiff + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 0, deblock_v_luma_c ); } else - deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, 1, deblock_v_chroma_c ); + deblock_edge( h, pixuv + j*strideuv, 2*strideuv, bs[1][4*j], qpc_top, a, b, 1, deblock_v_chroma_c ); } } else { int qpt = h->mb.qp[h->mb.i_mb_top_xy]; int qp_top = (qp + qpt + 1) >> 1; - int qpc_top = (h->chroma_qp_table[qp] + h->chroma_qp_table[qpt] + 1) >> 1; + int qpc_top = (qpc + h->chroma_qp_table[qpt] + 1) >> 1; int intra_top = IS_INTRA( h->mb.type[h->mb.i_mb_top_xy] ); if( (!b_interlaced || (!MB_INTERLACED && !h->mb.field[h->mb.i_mb_top_xy])) @@ -671,15 +674,21 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) */ void x264_macroblock_deblock( x264_t *h ) { - int qp_thresh = 15 - X264_MIN( h->sh.i_alpha_c0_offset, h->sh.i_beta_offset ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); + int a = h->sh.i_alpha_c0_offset - QP_BD_OFFSET; + int b = h->sh.i_beta_offset - QP_BD_OFFSET; + int qp_thresh = 15 - X264_MIN( a, b ) - X264_MAX( 0, h->pps->i_chroma_qp_index_offset ); + int intra_cur = IS_INTRA( h->mb.i_type ); int qp = h->mb.i_qp; int qpc = h->mb.i_chroma_qp; - if( qp <= qp_thresh || h->mb.i_type == P_SKIP ) + if( (h->mb.i_partition == D_16x16 && !h->mb.i_cbp_luma && !intra_cur) || qp <= qp_thresh ) return; uint8_t (*bs)[8][4] = h->deblock_strength[h->mb.i_mb_y&1][h->mb.i_mb_x]; - if( IS_INTRA( h->mb.i_type ) ) - memset( bs, 3, 2*8*4*sizeof(uint8_t) ); + if( intra_cur ) + { + memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) ); + memset( &bs[1][1], 3, 3*4*sizeof(uint8_t) ); + } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, bs, 4 >> SLICE_MBAFF, h->sh.i_type == SLICE_TYPE_B, h ); @@ -690,15 +699,15 @@ void x264_macroblock_deblock( x264_t *h ) do\ {\ deblock_edge( h, h->mb.pic.p_fdec[0] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qp, 0,\ + FDEC_STRIDE, bs[dir][edge], qp, a, b, 0,\ h->loopf.deblock_luma[dir] );\ if( CHROMA444 )\ {\ deblock_edge( h, h->mb.pic.p_fdec[1] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qpc, 0,\ + FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ deblock_edge( h, h->mb.pic.p_fdec[2] + 4*edge*(dir?FDEC_STRIDE:1),\ - FDEC_STRIDE, bs[dir][edge], qpc, 0,\ + FDEC_STRIDE, bs[dir][edge], qpc, a, b, 0,\ h->loopf.deblock_luma[dir] );\ }\ } while(0)