From 56108cb63848d4a553bccb7389226910f3f25e2e Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Sun, 15 Jun 2008 11:51:36 -0600 Subject: [PATCH] Use aligned memcpy for x264_me_t struct and cosmetics --- encoder/analyse.c | 96 ++++++++++++++++++++++----------------------- encoder/me.h | 2 +- encoder/slicetype.c | 20 ++++------ 3 files changed, 57 insertions(+), 61 deletions(-) diff --git a/encoder/analyse.c b/encoder/analyse.c index 17efe032..9200ace7 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1010,7 +1010,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) i_halfpel_thresh += i_ref_cost; if( m.cost < a->l0.me16x16.cost ) - a->l0.me16x16 = m; + h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) ); /* save mv for predicting neighbors */ *(uint32_t*)a->l0.mvc[i_ref][0] = @@ -1072,22 +1072,22 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t l0m->cost = INT_MAX; for( i_ref = 0; i_ref <= i_maxref; i_ref++ ) { - const int i_ref_cost = REF_COST( 0, i_ref ); - i_halfpel_thresh -= i_ref_cost; - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; - - LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); - x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); - x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); - x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh ); - - m.cost += i_ref_cost; - i_halfpel_thresh += i_ref_cost; - *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv; - - if( m.cost < l0m->cost ) - *l0m = m; + const int i_ref_cost = REF_COST( 0, i_ref ); + i_halfpel_thresh -= i_ref_cost; + m.i_ref_cost = i_ref_cost; + m.i_ref = i_ref; + + LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 ); + x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref ); + x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); + x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh ); + + m.cost += i_ref_cost; + i_halfpel_thresh += i_ref_cost; + *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv; + + if( m.cost < l0m->cost ) + h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); } x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv ); x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref ); @@ -1176,25 +1176,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a ) l0m->cost = INT_MAX; for( j = 0; j < i_ref8s; j++ ) { - const int i_ref = ref8[j]; - const int i_ref_cost = REF_COST( 0, i_ref ); - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + const int i_ref = ref8[j]; + const int i_ref_cost = REF_COST( 0, i_ref ); + m.i_ref_cost = i_ref_cost; + m.i_ref = i_ref; - /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */ - *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; - *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1]; - *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2]; + /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */ + *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; + *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1]; + *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2]; - LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i ); - x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref ); - x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp ); - x264_me_search( h, &m, mvc, 3 ); + LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i ); + x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref ); + x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp ); + x264_me_search( h, &m, mvc, 3 ); - m.cost += i_ref_cost; + m.cost += i_ref_cost; - if( m.cost < l0m->cost ) - *l0m = m; + if( m.cost < l0m->cost ) + h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); } x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv ); x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref ); @@ -1226,24 +1226,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a ) l0m->cost = INT_MAX; for( j = 0; j < i_ref8s; j++ ) { - const int i_ref = ref8[j]; - const int i_ref_cost = REF_COST( 0, i_ref ); - m.i_ref_cost = i_ref_cost; - m.i_ref = i_ref; + const int i_ref = ref8[j]; + const int i_ref_cost = REF_COST( 0, i_ref ); + m.i_ref_cost = i_ref_cost; + m.i_ref = i_ref; - *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; - *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1]; - *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3]; + *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0]; + *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1]; + *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3]; - LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 ); - x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref ); - x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); - x264_me_search( h, &m, mvc, 3 ); + LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 ); + x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref ); + x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp ); + x264_me_search( h, &m, mvc, 3 ); - m.cost += i_ref_cost; + m.cost += i_ref_cost; - if( m.cost < l0m->cost ) - *l0m = m; + if( m.cost < l0m->cost ) + h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) ); } x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv ); x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref ); @@ -1467,7 +1467,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) if( m.cost < a->l0.me16x16.cost ) { a->l0.i_ref = i_ref; - a->l0.me16x16 = m; + h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) ); } /* save mv for predicting neighbors */ @@ -1494,7 +1494,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) if( m.cost < a->l1.me16x16.cost ) { a->l1.i_ref = i_ref; - a->l1.me16x16 = m; + h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) ); } /* save mv for predicting neighbors */ diff --git a/encoder/me.h b/encoder/me.h index 96135c9e..6775a975 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -45,7 +45,7 @@ typedef struct int cost_mv; /* lambda * nbits for the chosen mv */ int cost; /* satd + lambda * nbits */ DECLARE_ALIGNED_4( int16_t mv[2] ); -} x264_me_t; +} DECLARE_ALIGNED_16( x264_me_t ); void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh ); static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc ) diff --git a/encoder/slicetype.c b/encoder/slicetype.c index fff7bc45..d72e40a5 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -89,13 +89,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, } #define SAVE_MVS( mv0, mv1 ) \ { \ - fenc->mv[0][i_mb_xy][0] = mv0[0]; \ - fenc->mv[0][i_mb_xy][1] = mv0[1]; \ + *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \ if( b_bidir ) \ - { \ - fenc->mv[1][i_mb_xy][0] = mv1[0]; \ - fenc->mv[1][i_mb_xy][1] = mv1[1]; \ - } \ + *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \ } #define CLIP_MV( mv ) \ { \ @@ -133,7 +129,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, int dmv[2][2]; int mv0[2] = {0,0}; - m[1] = m[0]; + h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) ); LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; @@ -144,7 +140,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, CLIP_MV( dmv[1] ); TRY_BIDIR( dmv[0], dmv[1], 0 ); - if( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] ) + if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] ) TRY_BIDIR( mv0, mv0, 0 ); // if( i_bcost < 60 ) // arbitrary threshold // return i_bcost; @@ -153,10 +149,10 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, i_cost_bak = i_bcost; for( l = 0; l < 1 + b_bidir; l++ ) { - int16_t mvc[4][2] = {{0}}; + DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}}; int i_mvc = 0; int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy]; -#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; } +#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; } if( i_mb_x > 0 ) MVC(fenc_mv[-1]); if( i_mb_y > 0 ) @@ -172,12 +168,12 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, x264_me_search( h, &m[l], mvc, i_mvc ); m[l].cost -= 2; // remove mvcost from skip mbs - if( m[l].mv[0] || m[l].mv[1] ) + if( *(uint32_t*)m[l].mv ) m[l].cost += 5; i_bcost = X264_MIN( i_bcost, m[l].cost ); } - if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) ) + if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) ) TRY_BIDIR( m[0].mv, m[1].mv, 5 ); if( i_bcost < i_cost_bak ) -- 2.40.0