From: Dylan Yudaken Date: Thu, 12 Nov 2009 15:03:46 +0000 (-0800) Subject: Various weightp fixes X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=45b28315b47759f29fd1605814ea361990c00dea;p=libx264 Various weightp fixes Make weightp results match in threaded vs non-threaded mode. Fix two-pass with slow-firstpass. --- diff --git a/common/macroblock.c b/common/macroblock.c index 4468f4b8..ca2b1327 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -726,7 +726,7 @@ int x264_macroblock_cache_init( x264_t *h ) if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) { - // only need buffer for lookahead thread + // only need buffer for lookahead if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] ) { // Fake analysis only works on lowres diff --git a/encoder/analyse.h b/encoder/analyse.h index cdf0d2e6..8fc2738a 100644 --- a/encoder/analyse.h +++ b/encoder/analyse.h @@ -32,7 +32,7 @@ void x264_slicetype_decide( x264_t *h ); void x264_slicetype_analyse( x264_t *h, int keyframe ); int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w ); -void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lowres, int b_lookahead ); +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ); int x264_lookahead_init( x264_t *h, int i_slicetype_length ); int x264_lookahead_is_empty( x264_t *h ); diff --git a/encoder/encoder.c b/encoder/encoder.c index 02ac5381..b3e71985 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -851,7 +851,8 @@ x264_t *x264_encoder_open( x264_param_t *param ) || h->param.rc.i_rc_method == X264_RC_CRF || h->param.i_bframe_adaptive || h->param.i_scenecut_threshold - || h->param.rc.b_mb_tree ); + || h->param.rc.b_mb_tree + || h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ); h->frames.b_have_lowres |= h->param.rc.b_stat_read && h->param.rc.i_vbv_buffer_size > 0; h->frames.b_have_sub8x8_esa = 
!!(h->param.analyse.inter & X264_ANALYSE_PSUB8x8); @@ -1327,8 +1328,6 @@ static inline void x264_reference_build_list( x264_t *h, int i_poc ) w[1].weightfn = w[2].weightfn = NULL; if( h->param.rc.b_stat_read ) x264_ratecontrol_set_weights( h, h->fenc ); - else if( h->param.i_threads == 1 ) - x264_weights_analyse( h, h->fenc, h->fref0[0], 0, 0 ); if( !h->fenc->weight[0][0].weightfn ) { diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index 1b90e59a..912ba541 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -326,7 +326,7 @@ int x264_reference_build_list_optimal( x264_t *h ) memcpy( frames, h->fref0, sizeof(frames) ); memcpy( refcount, rce->refcount, sizeof(refcount) ); memcpy( weights, h->fenc->weight, sizeof(weights) ); - memset( h->fenc->weight, 0, sizeof(h->fenc->weight) ); + memset( &h->fenc->weight[1][0], 0, sizeof(x264_weight_t[15][3]) ); /* For now don't reorder ref 0; it seems to lower quality in most cases due to skips. */ @@ -1333,7 +1333,7 @@ int x264_ratecontrol_end( x264_t *h, int bits ) if( h->sh.weight[0][0].weightfn ) { - if( fprintf( rc->p_stat_file_out, "w:%d,%d,%d", h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 ) + if( fprintf( rc->p_stat_file_out, "w:%"PRId32",%"PRId32",%"PRId32, h->sh.weight[0][0].i_denom, h->sh.weight[0][0].i_scale, h->sh.weight[0][0].i_offset ) < 0 ) goto fail; } diff --git a/encoder/slicetype.c b/encoder/slicetype.c index dfd8ae10..9c6d2829 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -85,70 +85,63 @@ static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width (dst)[3] = &(src)[3][i_pel_offset]; \ } -static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest, int b_lowres ) +static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest ) { - uint8_t **ref_planes = b_lowres ? 
ref->lowres : ref->filtered; int ref0_distance = fenc->i_frame - ref->i_frame - 1; /* Note: this will never run during lookahead as weights_analyse is only called if no * motion search has been done. */ - if( h->frames.b_have_lowres && fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF - && ( h->param.analyse.i_subpel_refine || h->param.i_threads > 1 )) + if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { uint8_t *src[4]; - int i_stride = b_lowres ? fenc->i_stride_lowres : fenc->i_stride[0]; - int i_lines = b_lowres ? fenc->i_lines_lowres : fenc->i_lines[0]; - int i_width = b_lowres ? fenc->i_width_lowres : fenc->i_width[0]; + int i_stride = fenc->i_stride_lowres; + int i_lines = fenc->i_lines_lowres; + int i_width = fenc->i_width_lowres; int i_mb_xy = 0; - int mbsizeshift = b_lowres ? 3 : 4; - int mbsize = 1 << mbsizeshift; int x,y; int i_pel_offset = 0; - for( y = 0; y < i_lines; y += mbsize, i_pel_offset = y*i_stride ) - for( x = 0; x < i_width; x += mbsize, i_mb_xy++, i_pel_offset += mbsize ) + for( y = 0; y < i_lines; y += 8, i_pel_offset = y*i_stride ) + for( x = 0; x < i_width; x += 8, i_mb_xy++, i_pel_offset += 8 ) { uint8_t *pix = &dest[ i_pel_offset ]; - int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0] << !b_lowres; - int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1] << !b_lowres; - LOAD_HPELS_LUMA( src, ref_planes ); + int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0]; + int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1]; + LOAD_HPELS_LUMA( src, ref->lowres ); h->mc.mc_luma( pix, i_stride, src, i_stride, - mvx, mvy, mbsize, mbsize, weight_none ); + mvx, mvy, 8, 8, weight_none ); } x264_emms(); return dest; } x264_emms(); - return ref_planes[0]; + return ref->lowres[0]; } #undef LOAD_HPELS_LUMA -static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w, int b_lowres ) +static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, 
x264_weight_t *w ) { int x, y; unsigned int cost = 0; - int mbsize = b_lowres ? 8 : 16; - int pixelsize = mbsize == 8 ? PIXEL_8x8 : PIXEL_16x16; - int i_stride = b_lowres ? fenc->i_stride_lowres : fenc->i_stride[0]; - int i_lines = b_lowres ? fenc->i_lines_lowres : fenc->i_lines[0]; - int i_width = b_lowres ? fenc->i_width_lowres : fenc->i_width[0]; - uint8_t *fenc_plane = b_lowres ? fenc->lowres[0] : fenc->plane[0]; - ALIGNED_ARRAY_16( uint8_t, buf,[16*16] ); + int i_stride = fenc->i_stride_lowres; + int i_lines = fenc->i_lines_lowres; + int i_width = fenc->i_width_lowres; + uint8_t *fenc_plane = fenc->lowres[0]; + ALIGNED_ARRAY_16( uint8_t, buf, [8*8] ); int pixoff = 0; int i_mb = 0; if( w ) - for( y = 0; y < i_lines; y += mbsize, pixoff = y*i_stride ) - for( x = 0; x < i_width; x += mbsize, i_mb++, pixoff += mbsize) + for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8) { - w->weightfn[mbsize>>2]( buf, 16, &src[pixoff], i_stride, w, mbsize ); - cost += X264_MIN( h->pixf.mbcmp[pixelsize]( buf, 16, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); + w->weightfn[8>>2]( buf, 8, &src[pixoff], i_stride, w, 8 ); + cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( buf, 8, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); } else - for( y = 0; y < i_lines; y += mbsize, pixoff = y*i_stride ) - for( x = 0; x < i_width; x+=mbsize, i_mb++, pixoff += mbsize ) - cost += X264_MIN( h->pixf.mbcmp[pixelsize]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); + for( y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 ) + cost += X264_MIN( h->pixf.mbcmp[PIXEL_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride ), fenc->i_intra_cost[i_mb] ); - int lambda = b_lowres ? 
1 : 4; if( w ) { int numslices; @@ -160,13 +153,14 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui numslices = 1; // FIXME still need to calculate for --slice-max-size // Multiply by 2 as there will be a duplicate. 10 bits added as if there is a weighted frame, then an additional duplicate is used. - cost += lambda * numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) ); + // Since using lowres frames, assume lambda = 1. + cost += numslices * ( 10 + 2 * ( bs_size_ue( w[0].i_denom ) + bs_size_se( w[0].i_scale ) + bs_size_se( w[0].i_offset ) ) ); } x264_emms(); return cost; } -void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lowres, int b_lookahead ) +void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead ) { unsigned int fenc_sum, ref_sum; float fenc_mean, ref_mean; @@ -209,8 +203,8 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int x264_lowres_context_init( h, &a ); x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 ); } - uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0], b_lowres ); - origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0, b_lowres ); + uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] ); + origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 ); if( !minscore ) return; @@ -221,13 +215,13 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int for( i_off = offset_search; i_off <= offset_search+!b_lookahead; i_off++ ) { SET_WEIGHT( weights[0], 1, minscale, mindenom, i_off ); - unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0], b_lowres ); + unsigned int s = x264_weight_cost( h, fenc, mcbuf, &weights[0] ); COPY3_IF_LT( minscore, s, minoff, i_off, found, 1 ); } x264_emms(); /* FIXME: More analysis can be done here on SAD vs. 
SATD termination. */ - if( !found || (minscale == 1<<mindenom && minoff == 0) || minscore >= fenc->i_width[0] * fenc->i_lines[0] * (b_lowres ? 2 : 8) ) + if( !found || (minscale == 1<<mindenom && minoff == 0) || minscore >= fenc->i_width[0] * fenc->i_lines[0] * 2 ) { SET_WEIGHT( weights[0], 0, 1, 0, 0 ); return; @@ -521,7 +515,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a, if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART || h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 ) { - x264_weights_analyse( h, frames[b], frames[p0], 1, 1 ); + x264_weights_analyse( h, frames[b], frames[p0], 1 ); w = frames[b]->weight[0]; } frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0; @@ -1268,12 +1262,9 @@ void x264_slicetype_decide( x264_t *h ) } /* Analyse for weighted P frames */ - if( h->lookahead->next.list[bframes]->i_type == X264_TYPE_P ) - { - memset( h->lookahead->next.list[bframes]->weight, 0, sizeof(h->lookahead->next.list[bframes]->weight) ); - if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART && h->param.i_threads > 1 ) - x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 1, 0 ); - } + if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P + && h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) + x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 ); /* shift sequence to coded order. use a small temporary list to avoid shifting the entire next buffer around */