From 1eb8b071a232873e40e001ec7379a917265bf372 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sat, 9 Aug 2008 09:34:37 -0600 Subject: [PATCH] Improve subme7 at low QPs and add subme7 support in lossless mode --- encoder/analyse.c | 7 +-- encoder/encoder.c | 1 - encoder/macroblock.c | 118 +++++++++++++++++++++++++------------------ encoder/me.c | 4 +- encoder/me.h | 3 +- encoder/rdo.c | 42 +++++++-------- 6 files changed, 99 insertions(+), 76 deletions(-) diff --git a/encoder/analyse.c b/encoder/analyse.c index 270b90ae..5362ba13 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -782,7 +782,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) uint8_t *p_dst = h->mb.pic.p_fdec[0]; int i, j, idx, x, y; - int i_max, i_satd, i_best, i_mode, i_thresh; + int i_max, i_mode, i_thresh; + uint64_t i_satd, i_best; int i_pred_mode; int predict_mode[9]; h->mb.i_skip_intra = 0; @@ -810,7 +811,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) for( idx = 0; idx < 16; idx++ ) { uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx]; - i_best = COST_MAX; + i_best = COST_MAX64; i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx ); @@ -860,7 +861,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) int j; i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8; - i_best = COST_MAX; + i_best = COST_MAX64; i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); x = idx&1; y = idx>>1; diff --git a/encoder/encoder.c b/encoder/encoder.c index 76fd1454..cf699700 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -410,7 +410,6 @@ static int x264_validate_parameters( x264_t *h ) h->param.analyse.i_trellis = 0; h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; - h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 6 ); } if( h->param.rc.i_rc_method == X264_RC_CQP ) { diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 51c56840..8c14302f 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -747,76 +747,96 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) uint8_t *p_fenc = h->mb.pic.p_fenc[0] + (i8&1)*8 + (i8>>1)*8*FENC_STRIDE; uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; - int nnz8x8; + int nnz8x8 = 0; int ch; x264_mb_mc_8x8( h, i8 ); - if( h->mb.b_transform_8x8 ) + if( h->mb.b_lossless ) { - DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); - h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); - h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] ); - h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); - - if( b_decimate ) - nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 ); - else - nnz8x8 = array_non_zero( dct8x8 ); - - if( nnz8x8 ) + int i4; + for( i4 = i8*4; i4 < i8*4+4; i4++ ) { - h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); - h->dctf.add8x8_idct8( p_fdec, dct8x8 ); + h->zigzagf.sub_4x4( h->dct.luma4x4[i4], + h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4], + h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] ); + nnz8x8 |= array_non_zero( h->dct.luma4x4[i4] ); + } + for( ch = 0; ch < 2; ch++ ) + { + p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE; + p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; + h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec ); + h->dct.luma4x4[16+i8+ch*4][0] = 0; } } else { - int i4; - DECLARE_ALIGNED_16( int16_t 
dct4x4[4][4][4] ); - h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); - for( i4 = 0; i4 < 4; i4++ ) - h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - for( i4 = 0; i4 < 4; i4++ ) - h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); - - if( b_decimate ) + if( h->mb.b_transform_8x8 ) { - int i_decimate_8x8 = 0; - for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) - i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 ); - nnz8x8 = 4 <= i_decimate_8x8; + DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); + h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); + h->quantf.quant_8x8( dct8x8, h->quant8_mf[CQM_8PY][i_qp], h->quant8_bias[CQM_8PY][i_qp] ); + h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); + + if( b_decimate ) + nnz8x8 = 4 <= x264_mb_decimate_score( h->dct.luma8x8[i8], 64 ); + else + nnz8x8 = array_non_zero( dct8x8 ); + + if( nnz8x8 ) + { + h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); + h->dctf.add8x8_idct8( p_fdec, dct8x8 ); + } } else - nnz8x8 = array_non_zero( dct4x4 ); - - if( nnz8x8 ) { + int i4; + DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); + h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); for( i4 = 0; i4 < 4; i4++ ) - h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); - h->dctf.add8x8_idct( p_fdec, dct4x4 ); + h->quantf.quant_4x4( dct4x4[i4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); + for( i4 = 0; i4 < 4; i4++ ) + h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); + + if( b_decimate ) + { + int i_decimate_8x8 = 0; + for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) + i_decimate_8x8 += x264_mb_decimate_score( h->dct.luma4x4[i8*4+i4], 16 ); + nnz8x8 = 4 <= i_decimate_8x8; + } + else + nnz8x8 = array_non_zero( dct4x4 ); + + if( nnz8x8 ) + { + for( i4 = 0; i4 < 4; i4++ ) + h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); + h->dctf.add8x8_idct( p_fdec, dct4x4 ); + } } - } - i_qp = h->mb.i_chroma_qp; + i_qp = h->mb.i_chroma_qp; - for( ch = 0; ch < 2; ch++ ) - { - DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); - p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE; - p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; - - h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 ); - h->dct.luma4x4[16+i8+ch*4][0] = 0; - if( array_non_zero( dct4x4 ) ) + for( ch = 0; ch < 2; ch++ ) { - h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); - h->dctf.add4x4_idct( p_fdec, dct4x4 ); + DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); + p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE; + p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; + + h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); + h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 ); + h->dct.luma4x4[16+i8+ch*4][0] = 0; + if( array_non_zero( dct4x4 ) ) + { + h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); + h->dctf.add4x4_idct( p_fdec, dct4x4 ); + } } } - h->mb.i_cbp_luma &= ~(1 << i8); h->mb.i_cbp_luma |= nnz8x8 << i8; h->mb.i_cbp_chroma = 0x02; diff --git a/encoder/me.c b/encoder/me.c index d4f3eaa6..f4f7e502 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -913,7 +913,7 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight { \ if( satd <= bsatd * 
SATD_THRESH )\ { \ - int cost; \ + uint64_t cost; \ *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \ cost = x264_rd_cost_part( h, i_lambda2, i8, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ @@ -934,7 +934,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 ) const int i_pixel = m->i_pixel; DECLARE_ALIGNED_16( uint8_t pix[16*16] ); - int bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX; + uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64; int bmx = m->mv[0]; int bmy = m->mv[1]; int omx = bmx; diff --git a/encoder/me.h b/encoder/me.h index 34806e12..655c2a14 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -25,6 +25,7 @@ #define X264_ME_H #define COST_MAX (1<<28) +#define COST_MAX64 (1ULL<<60) typedef struct { @@ -54,7 +55,7 @@ static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], i void x264_me_refine_qpel( x264_t *h, x264_me_t *m ); void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 ); int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); -int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); +uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); extern uint16_t *x264_cost_mv_fpel[52][4]; diff --git a/encoder/rdo.c b/encoder/rdo.c index 76bf57be..650b5ae5 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -101,9 +101,11 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 ) return i_ssd + i_bits; } -int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ) +/* subpartition RD functions use 8 bits more precision to avoid large rounding errors at low QPs */ + +uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ) { - int i_ssd, i_bits; + uint64_t i_ssd, i_bits; if( i_pixel == PIXEL_16x16 ) { @@ -128,19 +130,19 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ) x264_cabac_t cabac_tmp; COPY_CABAC; x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel ); - i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; + i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else { - i_bits = ( x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2 + 128 ) >> 8; + i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; } - return i_ssd + i_bits; + return (i_ssd<<8) + i_bits; } -int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) +uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) { - int i_ssd, i_bits; + uint64_t i_ssd, i_bits; x264_mb_encode_i8x8( h, i8, h->mb.i_qp ); i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 ); @@ -150,19 +152,19 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) x264_cabac_t cabac_tmp; COPY_CABAC; x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode ); - i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; + i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else { - i_bits = ( x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2 + 128 ) >> 8; + i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; } - return i_ssd + i_bits; + return (i_ssd<<8) + i_bits; } -int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode ) +uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode ) { - int i_ssd, i_bits; + uint64_t i_ssd, i_bits; x264_mb_encode_i4x4( h, i4, h->mb.i_qp ); 
i_ssd = ssd_plane( h, PIXEL_4x4, 0, block_idx_x[i4]*4, block_idx_y[i4]*4 ); @@ -172,19 +174,19 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode ) x264_cabac_t cabac_tmp; COPY_CABAC; x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode ); - i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; + i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else { - i_bits = ( x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2 + 128 ) >> 8; + i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; } - return i_ssd + i_bits; + return (i_ssd<<8) + i_bits; } -int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) +uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) { - int i_ssd, i_bits; + uint64_t i_ssd, i_bits; if( b_dct ) x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp ); @@ -198,14 +200,14 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) x264_cabac_t cabac_tmp; COPY_CABAC; x264_i8x8_chroma_size_cabac( h, &cabac_tmp ); - i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16; + i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else { - i_bits = ( x264_i8x8_chroma_size_cavlc( h ) * i_lambda2 + 128 ) >> 8; + i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2; } - return i_ssd + i_bits; + return (i_ssd<<8) + i_bits; } /**************************************************************************** * Trellis RD quantization -- 2.40.0
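
Note (editor's postscript, not part of the patch): the core of this change is that sub-partition RD costs are now compared in 8.8 fixed point. Previously the rate term was rounded to whole SSD units before the comparison, i.e. cost = ssd + ((bits*lambda2 + 128) >> 8); now the distortion is promoted instead, cost = (ssd << 8) + bits*lambda2, so the lambda-weighted rate keeps its 8 fractional bits. At low QPs lambda2 is small, the rounded rate term is only a few units, and the rounding error can swallow real differences between candidates. The standalone sketch below illustrates this with made-up numbers -- the ssd/bits/lambda2 values are hypothetical, and rd_cost_old/rd_cost_new are illustrative names, not x264 functions:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

/* Old scheme: the rate term is rounded to whole SSD units before the add. */
static int rd_cost_old( int ssd, int bits, int lambda2 )
{
    return ssd + ((bits * lambda2 + 128) >> 8);
}

/* New scheme (this patch): SSD is shifted up by 8 bits instead, so the
 * rate term keeps its fractional precision. Costs are compared as uint64_t,
 * with COST_MAX64 = 1ULL<<60 as the sentinel instead of COST_MAX = 1<<28. */
static uint64_t rd_cost_new( int ssd, int bits, int lambda2 )
{
    return ((uint64_t)ssd << 8) + (uint64_t)bits * lambda2;
}

int main( void )
{
    int lambda2 = 1;                   /* illustrative low-QP value */
    int ssd_a = 101, bits_a = 300;     /* candidate A: cheaper to code */
    int ssd_b = 100, bits_b = 550;     /* candidate B: truly cheaper overall */

    /* Old: both come out as 102 -- B's 6/256-unit advantage is rounded
     * away, so a strict less-than search keeps whichever candidate it
     * saw first. */
    printf( "old: A=%d B=%d\n",
            rd_cost_old( ssd_a, bits_a, lambda2 ),
            rd_cost_old( ssd_b, bits_b, lambda2 ) );

    /* New: A=26156, B=26150 -- B correctly wins. */
    printf( "new: A=%"PRIu64" B=%"PRIu64"\n",
            rd_cost_new( ssd_a, bits_a, lambda2 ),
            rd_cost_new( ssd_b, bits_b, lambda2 ) );
    return 0;
}

The extra 8 bits also explain the COST_MAX64/uint64_t plumbing in analyse.c, me.c and me.h: for a 16x16 block the shifted distortion term is at most 16*16*255^2 << 8 (about 2^32), so a 64-bit cost with a 1ULL<<60 sentinel leaves ample headroom, while the old 1<<28 sentinel would no longer even be an upper bound.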