From 56108cb63848d4a553bccb7389226910f3f25e2e Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Sun, 15 Jun 2008 11:51:36 -0600
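Subject: [PATCH] Use aligned memcpy for x264_me_t struct and cosmetics

Copy x264_me_t through h->mc.memcpy_aligned() instead of plain struct
assignment in analyse.c and slicetype.c, and declare the type with
DECLARE_ALIGNED_16 in me.h.

Cosmetics: re-indent the per-reference search loops in analyse.c. In
slicetype.c, copy and test motion vector pairs as single 32-bit words
instead of component by component.
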
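---
Review note (not part of the commit message): the rewritten SAVE_MVS
macro loads each argument through *(uint32_t*). That is only equivalent
to the old per-component stores when the argument is an int16_t[2]. The
slicetype.c context still declares "int dmv[2][2]" and "int mv0[2]" and
passes them to TRY_BIDIR; if TRY_BIDIR forwards them to SAVE_MVS (its
definition is outside this diff -- please confirm), the 32-bit load
covers only the first int and the y component is dropped.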
 encoder/analyse.c   | 96 ++++++++++++++++++++++-----------------------
 encoder/me.h        |  2 +-
 encoder/slicetype.c | 20 ++++------
 3 files changed, 57 insertions(+), 61 deletions(-)

diff --git a/encoder/analyse.c b/encoder/analyse.c
index 17efe032..9200ace7 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1010,7 +1010,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
         i_halfpel_thresh += i_ref_cost;
 
         if( m.cost < a->l0.me16x16.cost )
-            a->l0.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
 
         /* save mv for predicting neighbors */
         *(uint32_t*)a->l0.mvc[i_ref][0] = 
@@ -1072,22 +1072,22 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
         l0m->cost = INT_MAX;
         for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
         {
-             const int i_ref_cost = REF_COST( 0, i_ref );
-             i_halfpel_thresh -= i_ref_cost;
-             m.i_ref_cost = i_ref_cost;
-             m.i_ref = i_ref;
-
-             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
-             x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
-             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-             x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
-
-             m.cost += i_ref_cost;
-             i_halfpel_thresh += i_ref_cost;
-             *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
-
-             if( m.cost < l0m->cost )
-                 *l0m = m;
+            const int i_ref_cost = REF_COST( 0, i_ref );
+            i_halfpel_thresh -= i_ref_cost;
+            m.i_ref_cost = i_ref_cost;
+            m.i_ref = i_ref;
+
+            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*x8, 8*y8 );
+            x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, i_ref );
+            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+            x264_me_search_ref( h, &m, a->l0.mvc[i_ref], i+1, p_halfpel_thresh );
+
+            m.cost += i_ref_cost;
+            i_halfpel_thresh += i_ref_cost;
+            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+
+            if( m.cost < l0m->cost )
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
@@ -1176,25 +1176,25 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
         l0m->cost = INT_MAX;
         for( j = 0; j < i_ref8s; j++ )
         {
-             const int i_ref = ref8[j];
-             const int i_ref_cost = REF_COST( 0, i_ref );
-             m.i_ref_cost = i_ref_cost;
-             m.i_ref = i_ref;
+            const int i_ref = ref8[j];
+            const int i_ref_cost = REF_COST( 0, i_ref );
+            m.i_ref_cost = i_ref_cost;
+            m.i_ref = i_ref;
 
-             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
-             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+            /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
 
-             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
-             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
-             x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
-             x264_me_search( h, &m, mvc, 3 );
+            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
+            x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
+            x264_mb_predict_mv( h, 0, 8*i, 4, m.mvp );
+            x264_me_search( h, &m, mvc, 3 );
 
-             m.cost += i_ref_cost;
+            m.cost += i_ref_cost;
 
-             if( m.cost < l0m->cost )
-                 *l0m = m;
+            if( m.cost < l0m->cost )
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
@@ -1226,24 +1226,24 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
         l0m->cost = INT_MAX;
         for( j = 0; j < i_ref8s; j++ )
         {
-             const int i_ref = ref8[j];
-             const int i_ref_cost = REF_COST( 0, i_ref );
-             m.i_ref_cost = i_ref_cost;
-             m.i_ref = i_ref;
+            const int i_ref = ref8[j];
+            const int i_ref_cost = REF_COST( 0, i_ref );
+            m.i_ref_cost = i_ref_cost;
+            m.i_ref = i_ref;
 
-             *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-             *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
-             *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
 
-             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
-             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
-             x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
-             x264_me_search( h, &m, mvc, 3 );
+            LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
+            x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
+            x264_mb_predict_mv( h, 0, 4*i, 2, m.mvp );
+            x264_me_search( h, &m, mvc, 3 );
 
-             m.cost += i_ref_cost;
+            m.cost += i_ref_cost;
 
-             if( m.cost < l0m->cost )
-                 *l0m = m;
+            if( m.cost < l0m->cost )
+                h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
         }
         x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
@@ -1467,7 +1467,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         if( m.cost < a->l0.me16x16.cost )
         {
             a->l0.i_ref = i_ref;
-            a->l0.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
         }
 
         /* save mv for predicting neighbors */
@@ -1494,7 +1494,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         if( m.cost < a->l1.me16x16.cost )
         {
             a->l1.i_ref = i_ref;
-            a->l1.me16x16 = m;
+            h->mc.memcpy_aligned( &a->l1.me16x16, &m, sizeof(x264_me_t) );
         }
 
         /* save mv for predicting neighbors */
diff --git a/encoder/me.h b/encoder/me.h
index 96135c9e..6775a975 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -45,7 +45,7 @@ typedef struct
     int cost_mv;        /* lambda * nbits for the chosen mv */
     int cost;           /* satd + lambda * nbits */
     DECLARE_ALIGNED_4( int16_t mv[2] );
-} x264_me_t;
+} DECLARE_ALIGNED_16( x264_me_t );
 
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
 static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index fff7bc45..d72e40a5 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -89,13 +89,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     }
 #define SAVE_MVS( mv0, mv1 ) \
     { \
-        fenc->mv[0][i_mb_xy][0] = mv0[0]; \
-        fenc->mv[0][i_mb_xy][1] = mv0[1]; \
+        *(uint32_t*)fenc->mv[0][i_mb_xy] = *(uint32_t*)mv0; \
         if( b_bidir ) \
-        { \
-            fenc->mv[1][i_mb_xy][0] = mv1[0]; \
-            fenc->mv[1][i_mb_xy][1] = mv1[1]; \
-        } \
+            *(uint32_t*)fenc->mv[1][i_mb_xy] = *(uint32_t*)mv1; \
     }
 #define CLIP_MV( mv ) \
     { \
@@ -133,7 +129,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         int dmv[2][2];
         int mv0[2] = {0,0};
 
-        m[1] = m[0];
+        h->mc.memcpy_aligned( &m[1], &m[0], sizeof(x264_me_t) );
         LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres );
 
         dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8;
@@ -144,7 +140,7 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         CLIP_MV( dmv[1] );
 
         TRY_BIDIR( dmv[0], dmv[1], 0 );
-        if( dmv[0][0] || dmv[0][1] || dmv[1][0] || dmv[1][1] )
+        if( dmv[0][0] | dmv[0][1] | dmv[1][0] | dmv[1][1] )
            TRY_BIDIR( mv0, mv0, 0 );
 //      if( i_bcost < 60 ) // arbitrary threshold
 //          return i_bcost;
@@ -153,10 +149,10 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     i_cost_bak = i_bcost;
     for( l = 0; l < 1 + b_bidir; l++ )
     {
-        int16_t mvc[4][2] = {{0}};
+        DECLARE_ALIGNED_4(int16_t mvc[4][2]) = {{0}};
         int i_mvc = 0;
         int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-#define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
+#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
         if( i_mb_x > 0 )
             MVC(fenc_mv[-1]);
         if( i_mb_y > 0 )
@@ -172,12 +168,12 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         x264_me_search( h, &m[l], mvc, i_mvc );
 
         m[l].cost -= 2; // remove mvcost from skip mbs
-        if( m[l].mv[0] || m[l].mv[1] )
+        if( *(uint32_t*)m[l].mv )
             m[l].cost += 5;
         i_bcost = X264_MIN( i_bcost, m[l].cost );
     }
 
-    if( b_bidir && (m[0].mv[0] || m[0].mv[1] || m[1].mv[0] || m[1].mv[1]) )
+    if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
         TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
     if( i_bcost < i_cost_bak )
-- 
2.40.0