From bfa2eac7fdc92eaf27004ef66e93898ec27f61f1 Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Thu, 15 May 2008 06:01:01 -0600
Subject: [PATCH] explicit write combining, because gcc fails at optimizing consecutive memory accesses

---
 common/common.h      |   6 +-
 common/macroblock.c  | 215 ++++++++++++++-----------------------
 common/macroblock.h  |  55 ++++++-----
 common/osdep.h       |   3 +
 encoder/analyse.c    | 149 ++++++++++++++----------------
 encoder/cabac.c      |   4 +-
 encoder/cavlc.c      |   6 +-
 encoder/macroblock.c |  10 +-
 encoder/me.c         |   6 +-
 encoder/me.h         |   8 +-
 encoder/slicetype.c  |   4 +-
 11 files changed, 196 insertions(+), 270 deletions(-)

diff --git a/common/common.h b/common/common.h
index 628c0009..a53509b4 100644
--- a/common/common.h
+++ b/common/common.h
@@ -471,14 +471,14 @@ struct x264_t
 
         /* 0 if not available */
         DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
-        DECLARE_ALIGNED_4( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+        DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
 
         /* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
         DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
 
         DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
-        int8_t  direct_ref[2][X264_SCAN8_SIZE];
-        int     pskip_mv[2];
+        DECLARE_ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
+        DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
 
         /* number of neighbors (top and left) that used 8x8 dct */
         int     i_neighbour_transform_size;
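An illustrative aside, not part of the patch: the idiom being introduced everywhere below. gcc of this era emits two separate 16-bit stores for a motion-vector copy; casting through uint32_t forces one combined store. The alignment attributes added above keep the punned access aligned, and the sketch assumes a build where such type-punning is permitted (x264 is compiled without strict-aliasing assumptions).

#include <stdint.h>

/* Naive copy: gcc ~4.x emits two separate 16-bit loads and stores. */
static void copy_mv_naive( int16_t dst[2], const int16_t src[2] )
{
    dst[0] = src[0];
    dst[1] = src[1];
}

/* Explicit write combining: one 32-bit load, one 32-bit store.
 * Both pointers must be 4-byte aligned, which is what the
 * DECLARE_ALIGNED_4/8 changes in this patch guarantee. */
static void copy_mv_combined( int16_t dst[2], const int16_t src[2] )
{
    *(uint32_t*)dst = *(const uint32_t*)src;
}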
diff --git a/common/macroblock.c b/common/macroblock.c
index cd1f9cc7..d2fc0cbc 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -88,7 +88,7 @@ int x264_mb_transform_8x8_allowed( x264_t *h )
     }
 }
 
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
 {
     const int i8 = x264_scan8[idx];
     const int i_ref= h->mb.cache.ref[i_list][i8];
@@ -111,14 +111,12 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     {
         if( idx == 0 && i_refb == i_ref )
         {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
             return;
         }
         else if( idx != 0 && i_refa == i_ref )
         {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
             return;
         }
     }
@@ -126,14 +124,12 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     {
         if( idx == 0 && i_refa == i_ref )
         {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
             return;
         }
         else if( idx != 0 && i_refc == i_ref )
         {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
             return;
         }
     }
@@ -151,26 +147,14 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-        {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
         else if( i_refb == i_ref )
-        {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
         else
-        {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-    {
-        mvp[0] = mv_a[0];
-        mvp[1] = mv_a[1];
-    }
+        *(uint32_t*)mvp = *(uint32_t*)mv_a;
     else
     {
         mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
@@ -178,7 +162,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2]
     }
 }
 
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] )
 {
     int i_refa = h->mb.cache.ref[i_list][X264_SCAN8_0 - 1];
     int16_t *mv_a = h->mb.cache.mv[i_list][X264_SCAN8_0 - 1];
@@ -208,26 +192,14 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-        {
-            mvp[0] = mv_a[0];
-            mvp[1] = mv_a[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_a;
         else if( i_refb == i_ref )
-        {
-            mvp[0] = mv_b[0];
-            mvp[1] = mv_b[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_b;
         else
-        {
-            mvp[0] = mv_c[0];
-            mvp[1] = mv_c[1];
-        }
+            *(uint32_t*)mvp = *(uint32_t*)mv_c;
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-    {
-        mvp[0] = mv_a[0];
-        mvp[1] = mv_a[1];
-    }
+        *(uint32_t*)mvp = *(uint32_t*)mv_a;
     else
     {
         mvp[0] = x264_median( mv_a[0], mv_b[0], mv_c[0] );
@@ -236,7 +208,7 @@ void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] )
     }
 }
 
-void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
+void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
 {
     int i_refa = h->mb.cache.ref[0][X264_SCAN8_0 - 1];
     int i_refb = h->mb.cache.ref[0][X264_SCAN8_0 - 8];
@@ -244,10 +216,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] )
     int16_t *mv_a = h->mb.cache.mv[0][X264_SCAN8_0 - 1];
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        ( i_refa == 0 && mv_a[0] == 0 && mv_a[1] == 0 ) ||
-        ( i_refb == 0 && mv_b[0] == 0 && mv_b[1] == 0 ) )
+        ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
+        ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
     {
-        mv[0] = mv[1] = 0;
+        *(uint32_t*)mv = 0;
     }
     else
     {
@@ -268,8 +240,8 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
     if( IS_INTRA( type_col ) )
     {
         x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0, 0 );
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0, 0 );
+        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, 0 );
+        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, 0 );
         return 1;
     }
     b8x8 = h->sps->b_direct8x8_inference ||
@@ -291,11 +263,10 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
         if( b8x8 )
         {
             const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + 3*x8 + 3*y8 * h->mb.i_b4_stride];
-            int mv_l0[2];
-            mv_l0[0] = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-            mv_l0[1] = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, mv_l0[0], mv_l0[1] );
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, mv_l0[0] - mv_col[0], mv_l0[1] - mv_col[1] );
+            const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+            const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, pack16to32_mask(l0x, l0y) );
+            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
         }
         else
         {
@@ -304,11 +275,10 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
                 const int x4 = i4%2 + 2*x8;
                 const int y4 = i4/2 + 2*y8;
                 const int16_t *mv_col = h->fref1[0]->mv[0][ i_mb_4x4 + x4 + y4 * h->mb.i_b4_stride ];
-                int mv_l0[2];
-                mv_l0[0] = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
-                mv_l0[1] = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
-                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, mv_l0[0], mv_l0[1] );
-                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, mv_l0[0] - mv_col[0], mv_l0[1] - mv_col[1] );
+                const int l0x = ( dist_scale_factor * mv_col[0] + 128 ) >> 8;
+                const int l0y = ( dist_scale_factor * mv_col[1] + 128 ) >> 8;
+                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, pack16to32_mask(l0x, l0y) );
+                x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, pack16to32_mask(l0x-mv_col[0], l0y-mv_col[1]) );
             }
         }
     }
@@ -350,7 +320,7 @@ static int x264_mb_predict_mv_direct16x16_temporal( x264_t *h )
 static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
 {
     int ref[2];
-    int mv[2][2];
+    DECLARE_ALIGNED_4( int16_t mv[2][2] );
     int i_list;
     int i8, i4;
     int b8x8;
@@ -381,10 +351,7 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     {
         ref[0] = ref[1] = 0;
-        mv[0][0] =
-        mv[0][1] =
-        mv[1][0] =
-        mv[1][1] = 0;
+        *(uint64_t*)mv[0] = 0;
     }
     else
     {
@@ -393,14 +360,14 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
             if( ref[i_list] >= 0 )
                 x264_mb_predict_mv_16x16( h, i_list, ref[i_list], mv[i_list] );
             else
-                mv[i_list][0] = mv[i_list][1] = 0;
+                *(uint32_t*)mv[i_list] = 0;
         }
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
-    x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, mv[0][0], mv[0][1] );
-    x264_macroblock_cache_mv( h, 0, 0, 4, 4, 1, mv[1][0], mv[1][1] );
+    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, mv[0] );
+    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, mv[1] );
 
     if( IS_INTRA( type_col ) )
         return 1;
@@ -435,9 +402,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
             if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
             {
                 if( ref[0] == 0 )
-                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0, 0 );
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, 0 );
                 if( ref[1] == 0 )
-                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0, 0 );
+                    x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 1, 0 );
             }
         }
         else
@@ -450,9 +417,9 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
                 if( abs( mvcol[0] ) <= 1 && abs( mvcol[1] ) <= 1 )
                 {
                     if( ref[0] == 0 )
-                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0, 0 );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, 0 );
                     if( ref[1] == 0 )
-                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0, 0 );
+                        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 1, 0 );
                 }
             }
         }
@@ -527,14 +494,13 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
 #define FIXED_SCALE 256
 
 /* This just improves encoder performance, it's not part of the spec */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[8][2], int *i_mvc )
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc )
 {
     int16_t (*mvr)[2] = h->mb.mvr[i_list][i_ref];
     int i = 0;
 
 #define SET_MVP(mvp) { \
-        mvc[i][0] = mvp[0]; \
-        mvc[i][1] = mvp[1]; \
+        *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
         i++; \
     }
@@ -1262,15 +1228,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int ir = i_top_8x8 - 1;
             const int iv = i_top_4x4 - 1;
             h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
-            h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
         }
         else
         {
             const int i8 = x264_scan8[0] - 1 - 1*8;
             h->mb.cache.ref[i_list][i8] = -2;
-            h->mb.cache.mv[i_list][i8][0] = 0;
-            h->mb.cache.mv[i_list][i8][1] = 0;
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
         }
 
         if( h->mb.i_neighbour & MB_TOP )
@@ -1282,22 +1246,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
             h->mb.cache.ref[i_list][i8+2] =
             h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
-
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mv[i_list][i8+i][0] = h->mb.mv[i_list][iv + i][0];
-                h->mb.cache.mv[i_list][i8+i][1] = h->mb.mv[i_list][iv + i][1];
-            }
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
         }
         else
        {
             const int i8 = x264_scan8[0] - 8;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.ref[i_list][i8+i] = -2;
-                h->mb.cache.mv[i_list][i8+i][0] =
-                h->mb.cache.mv[i_list][i8+i][1] = 0;
-            }
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
+            *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
+            *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
         }
 
         if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1306,15 +1263,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int ir = i_top_8x8 + 2;
             const int iv = i_top_4x4 + 4;
             h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-            h->mb.cache.mv[i_list][i8][0] = h->mb.mv[i_list][iv][0];
-            h->mb.cache.mv[i_list][i8][1] = h->mb.mv[i_list][iv][1];
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
         }
         else
        {
             const int i8 = x264_scan8[0] + 4 - 1*8;
             h->mb.cache.ref[i_list][i8] = -2;
-            h->mb.cache.mv[i_list][i8][0] = 0;
-            h->mb.cache.mv[i_list][i8][1] = 0;
+            *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
         }
 
         if( h->mb.i_neighbour & MB_LEFT )
@@ -1328,10 +1283,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
 
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mv[i_list][i8+i*8][0] = h->mb.mv[i_list][iv + i*s4x4][0];
-                h->mb.cache.mv[i_list][i8+i*8][1] = h->mb.mv[i_list][iv + i*s4x4][1];
-            }
+                *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = *(uint32_t*)h->mb.mv[i_list][iv + i*s4x4];
         }
         else
         {
@@ -1339,8 +1291,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             for( i = 0; i < 4; i++ )
             {
                 h->mb.cache.ref[i_list][i8+i*8] = -2;
-                h->mb.cache.mv[i_list][i8+i*8][0] =
-                h->mb.cache.mv[i_list][i8+i*8][1] = 0;
+                *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
             }
         }
 
@@ -1350,20 +1301,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         {
             const int i8 = x264_scan8[0] - 8;
             const int iv = i_top_4x4;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i][0] = h->mb.mvd[i_list][iv + i][0];
-                h->mb.cache.mvd[i_list][i8+i][1] = h->mb.mvd[i_list][iv + i][1];
-            }
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
         }
         else
         {
             const int i8 = x264_scan8[0] - 8;
-            for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i][0] =
-                h->mb.cache.mvd[i_list][i8+i][1] = 0;
-            }
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
+            *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
         }
 
         if( i_left_type >= 0 )
@@ -1371,19 +1316,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
             const int i8 = x264_scan8[0] - 1;
             const int iv = i_mb_4x4 - 1;
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i*8][0] = h->mb.mvd[i_list][iv + i*s4x4][0];
-                h->mb.cache.mvd[i_list][i8+i*8][1] = h->mb.mvd[i_list][iv + i*s4x4][1];
-            }
+                *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = *(uint32_t*)h->mb.mvd[i_list][iv + i*s4x4];
         }
         else
         {
             const int i8 = x264_scan8[0] - 1;
             for( i = 0; i < 4; i++ )
-            {
-                h->mb.cache.mvd[i_list][i8+i*8][0] =
-                h->mb.cache.mvd[i_list][i8+i*8][1] = 0;
-            }
+                *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
         }
     }
 }
@@ -1516,7 +1455,7 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
         {
-            int y,x;
+            int y;
 
             h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[0]];
             h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[i_list][x264_scan8[4]];
@@ -1525,11 +1464,8 @@ void x264_macroblock_cache_save( x264_t *h )
 
             for( y = 0; y < 4; y++ )
             {
-                for( x = 0; x < 4; x++ )
-                {
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][0];
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mv[i_list][x264_scan8[0]+x+8*y][1];
-                }
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+0];
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[i_list][x264_scan8[0]+8*y+2];
             }
         }
     }
@@ -1538,20 +1474,15 @@ void x264_macroblock_cache_save( x264_t *h )
         int i_list;
         for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
         {
-            int y,x;
+            int y;
 
-            h->mb.ref[i_list][i_mb_8x8+0+0*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+1+0*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+0+1*s8x8] =
-            h->mb.ref[i_list][i_mb_8x8+1+1*s8x8] = -1;
+            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
+            *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
 
             for( y = 0; y < 4; y++ )
             {
-                for( x = 0; x < 4; x++ )
-                {
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
-                    h->mb.mv[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
-                }
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
+                *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
             }
         }
     }
@@ -1569,14 +1500,11 @@ void x264_macroblock_cache_save( x264_t *h )
             for( i_list = 0; i_list < 2; i_list++ )
             {
                 const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y,x;
+                int y;
                 for( y = 0; y < 4; y++ )
                 {
-                    for( x = 0; x < 4; x++ )
-                    {
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][0];
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = h->mb.cache.mvd[i_list][x264_scan8[0]+x+8*y][1];
-                    }
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+0];
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[i_list][x264_scan8[0]+8*y+2];
                 }
             }
         }
@@ -1586,14 +1514,11 @@ void x264_macroblock_cache_save( x264_t *h )
             for( i_list = 0; i_list < 2; i_list++ )
             {
                 const int s4x4 = 4 * h->mb.i_mb_stride;
-                int y,x;
+                int y;
                 for( y = 0; y < 4; y++ )
                 {
-                    for( x = 0; x < 4; x++ )
-                    {
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][0] = 0;
-                        h->mb.mvd[i_list][i_mb_4x4+x+y*s4x4][1] = 0;
-                    }
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+0] = 0;
+                    *(uint64_t*)h->mb.mvd[i_list][i_mb_4x4+y*s4x4+2] = 0;
                 }
             }
         }
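An aside on the fill idiom in the cache_load/cache_save hunks above (illustrative only, not from the patch): multiplying a byte by 0x01010101 replicates it into every byte of a 32-bit word, so four int8_t reference indices can be set to -2 in one store, and two to -1 via the 16-bit 0x0101 variant.

#include <assert.h>
#include <stdint.h>

int main( void )
{
    /* The union guarantees alignment for the 32-bit store. */
    union { uint32_t u32; int8_t ref[4]; } u;
    /* (uint8_t)(-2) == 0xFE; 0xFE * 0x01010101 == 0xFEFEFEFE,
     * i.e. the byte replicated into all four lanes. */
    u.u32 = (uint8_t)(-2) * 0x01010101U;
    assert( u.ref[0] == -2 && u.ref[1] == -2 &&
            u.ref[2] == -2 && u.ref[3] == -2 );
    return 0;
}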
diff --git a/common/macroblock.h b/common/macroblock.h
index e127e2b7..2766ddd0 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -253,16 +253,16 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y )
 /* x264_mb_predict_mv_16x16:
  *      set mvp with predicted mv for D_16x16 block
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int mvp[2] );
+void x264_mb_predict_mv_16x16( x264_t *h, int i_list, int i_ref, int16_t mvp[2] );
 /* x264_mb_predict_mv_pskip:
  *      set mvp with predicted mv for P_SKIP
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_pskip( x264_t *h, int mv[2] );
+void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] );
 /* x264_mb_predict_mv:
  *      set mvp with predicted mv for all blocks except SKIP and DIRECT
  *      h->mb. need valid ref/partition/sub of current block to be valid
  *      and valid mv/ref from other blocks. */
-void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] );
+void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] );
 /* x264_mb_predict_mv_direct16x16:
  *      set h->mb.cache.mv and h->mb.cache.ref for B_SKIP or B_DIRECT
  *      h->mb. need only valid values from other blocks.
@@ -278,7 +278,7 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx );
  *      set mvc with D_16x16 prediction.
  *      uses all neighbors, even those that didn't end up using this ref.
  *      h->mb. need only valid values from other blocks */
-void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[8][2], int *i_mvc );
+void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[8][2], int *i_mvc );
 
 int x264_mb_predict_intra4x4_mode( x264_t *h, int idx );
 
@@ -293,7 +293,14 @@ int x264_mb_transform_8x8_allowed( x264_t *h );
 void x264_mb_mc( x264_t *h );
 void x264_mb_mc_8x8( x264_t *h, int i8 );
 
-
+static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
+{
+#ifdef WORDS_BIGENDIAN
+    return (b&0xFFFF) + (a<<16);
+#else
+    return (a&0xFFFF) + (b<<16);
+#endif
+}
 static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
 {
     int dy;
@@ -313,30 +320,32 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
 {
     int dy, dx;
-    for( dy = 0; dy < height; dy++ )
-        for( dx = 0; dx < width; dx++ )
-            ((uint32_t*)dst)[dx+8*dy] = val;
-}
-static ALWAYS_INLINE uint32_t pack16to32_clip( int a, int b )
-{
-#ifdef WORDS_BIGENDIAN
-    return (b&0xFFFF) + (a<<16);
-#else
-    return (a&0xFFFF) + (b<<16);
-#endif
+    if( width == 1 || WORD_SIZE < 8 )
+    {
+        for( dy = 0; dy < height; dy++ )
+            for( dx = 0; dx < width; dx++ )
+                ((uint32_t*)dst)[dx+8*dy] = val;
+    }
+    else
+    {
+        uint64_t val64 = val + ((uint64_t)val<<32);
+        for( dy = 0; dy < height; dy++ )
+            for( dx = 0; dx < width/2; dx++ )
+                ((uint64_t*)dst)[dx+4*dy] = val64;
+    }
 }
-
-static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
+#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
-    x264_macroblock_cache_rect1( &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y], width, height, ref );
+    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, int mvx, int mvy )
+static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mvx,mvy) );
+    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
 }
-static ALWAYS_INLINE void x264_macroblock_cache_mvd( x264_t *h, int x, int y, int width, int height, int i_list, int mdx, int mdy )
+static ALWAYS_INLINE void x264_macroblock_cache_ref( x264_t *h, int x, int y, int width, int height, int i_list, uint8_t ref )
 {
-    x264_macroblock_cache_rect4( &h->mb.cache.mvd[i_list][X264_SCAN8_0+x+8*y], width, height, pack16to32_clip(mdx,mdy) );
+    x264_macroblock_cache_rect1( &h->mb.cache.ref[i_list][X264_SCAN8_0+x+8*y], width, height, ref );
 }
 static ALWAYS_INLINE void x264_macroblock_cache_skip( x264_t *h, int x, int y, int width, int height, int b_skip )
 {
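Why pack16to32_mask above is endian-conditional (an illustrative sketch, not from the patch): the packed word must match the in-memory layout of an int16_t[2] pair, so a value built from (mvx,mvy) compares and stores identically to a punned load of a stored pair. The little test below assumes a little-endian host.

#include <assert.h>
#include <stdint.h>

/* The little-endian branch of the patch's pack16to32_mask. */
static uint32_t pack16to32_mask_le( int a, int b )
{
    return (a&0xFFFF) + (b<<16);
}

int main( void )
{
    union { uint32_t u32; int16_t mv[2]; } u;
    u.mv[0] = -3; /* mvx */
    u.mv[1] =  7; /* mvy */
    /* On little-endian, the packed constant equals the punned pair;
     * this is what lets x264_macroblock_cache_mv take one uint32_t. */
    assert( u.u32 == pack16to32_mask_le( -3, 7 ) );
    return 0;
}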
diff --git a/common/osdep.h b/common/osdep.h
index d914e784..c7353e0d 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -127,4 +127,7 @@
 #define x264_pthread_cond_wait(c,m)      usleep(100)
 #endif
 
+/* FIXME: long isn't always the native register size (e.g. win64). */
+#define WORD_SIZE sizeof(long)
+
 #endif /* X264_OSDEP_H */
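The WORD_SIZE test above feeds x264_macroblock_cache_rect4's new 64-bit path, which duplicates the packed MV into both halves of a 64-bit word so each row is filled with half as many stores. A tiny check of that duplication identity (illustrative only):

#include <assert.h>
#include <stdint.h>

int main( void )
{
    uint32_t val = 0x0007FFFDU; /* e.g. a packed (mvx,mvy) pair */
    /* The 64-bit fast path writes two packed MVs per store: */
    uint64_t val64 = val + ((uint64_t)val << 32);
    assert( (uint32_t)val64 == val );         /* low half  */
    assert( (uint32_t)(val64 >> 32) == val ); /* high half */
    return 0;
}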
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 9da9a5c7..dad24b1f 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -45,7 +45,7 @@ typedef struct
     /* 8x8 */
     int       i_cost8x8;
     /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
-    DECLARE_ALIGNED_8( int mvc[32][5][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
     x264_me_t me8x8[4];
 
     /* Sub 4x4 */
@@ -967,8 +967,8 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
-    int i_ref;
-    int mvc[7][2], i_mvc;
+    int i_ref, i_mvc;
+    DECLARE_ALIGNED_4( int16_t mvc[7][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1013,10 +1013,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
             a->l0.me16x16 = m;
 
         /* save mv for predicting neighbors */
-        a->l0.mvc[i_ref][0][0] =
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        a->l0.mvc[i_ref][0][1] =
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)a->l0.mvc[i_ref][0] =
+        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
@@ -1024,11 +1022,10 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_type = P_L0;
     if( a->b_mbrd && a->l0.me16x16.i_ref == 0
-        && a->l0.me16x16.mv[0] == h->mb.cache.pskip_mv[0]
-        && a->l0.me16x16.mv[1] == h->mb.cache.pskip_mv[1] )
+        && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
     {
         h->mb.i_partition = D_16x16;
-        x264_macroblock_cache_mv( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
         a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 );
     }
 }
@@ -1060,10 +1057,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     }
 
     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
-    {
-        a->l0.mvc[i_ref][0][0] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0];
-        a->l0.mvc[i_ref][0][1] = h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1];
-    }
+        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
 
     for( i = 0; i < 4; i++ )
     {
@@ -1090,12 +1084,12 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
-            *(uint64_t*)a->l0.mvc[i_ref][i+1] = *(uint64_t*)m.mv;
+            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
 
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*x8, 2*y8, 2, 2, 0, l0m->i_ref );
 
         /* mb type cost */
@@ -1115,14 +1109,14 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref = h->mb.pic.p_fref[0][i_ref];
     uint8_t **p_fenc = h->mb.pic.p_fenc;
     int i_mvc;
-    int (*mvc)[2] = a->l0.mvc[i_ref];
+    int16_t (*mvc)[2] = a->l0.mvc[i_ref];
     int i;
 
     /* XXX Needed for x264_mb_predict_mv */
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.me16x16.mv;
+    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
 
     for( i = 0; i < 4; i++ )
     {
@@ -1140,9 +1134,9 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
         x264_mb_predict_mv( h, 0, 4*i, 2, m->mvp );
         x264_me_search( h, m, mvc, i_mvc );
 
-        x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
 
-        *(uint64_t*)mvc[i_mvc] = *(uint64_t*)m->mv;
+        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
         i_mvc++;
 
         /* mb type cost */
@@ -1163,7 +1157,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_8( int mvc[3][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1188,9 +1182,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
             m.i_ref = i_ref;
 
             /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-            *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
-            *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][2*i+1];
-            *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][2*i+2];
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
             x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, i_ref );
@@ -1202,7 +1196,7 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 0, 2*i, 4, 2, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 0, 2*i, 4, 2, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 0, 2*i, 4, 2, 0, l0m->i_ref );
     }
 
@@ -1213,7 +1207,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
 {
     x264_me_t m;
     uint8_t **p_fenc = h->mb.pic.p_fenc;
-    DECLARE_ALIGNED_8( int mvc[3][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[3][2] );
     int i, j;
 
     /* XXX Needed for x264_mb_predict_mv */
@@ -1237,9 +1231,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
             m.i_ref_cost = i_ref_cost;
             m.i_ref = i_ref;
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)a->l0.mvc[i_ref][0];
-            *(uint64_t*)mvc[1] = *(uint64_t*)a->l0.mvc[i_ref][i+1];
-            *(uint64_t*)mvc[2] = *(uint64_t*)a->l0.mvc[i_ref][i+3];
+            *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
+            *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
+            *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
 
             LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
             x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, i_ref );
@@ -1251,7 +1245,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
             if( m.cost < l0m->cost )
                 *l0m = m;
         }
-        x264_macroblock_cache_mv( h, 2*i, 0, 2, 4, 0, l0m->mv[0], l0m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, 2*i, 0, 2, 4, 0, l0m->mv );
         x264_macroblock_cache_ref( h, 2*i, 0, 2, 4, 0, l0m->i_ref );
     }
 
@@ -1320,7 +1314,7 @@ static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me8x8[i8x8].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 1, 1, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 1, 0, m->mv );
     }
 
     a->l0.i_cost4x4[i8x8] = a->l0.me4x4[i8x8][0].cost +
                             a->l0.me4x4[i8x8][1].cost +
@@ -1360,7 +1354,7 @@ static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 2, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 2, 1, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 2, 1, 0, m->mv );
     }
     a->l0.i_cost8x4[i8x8] = a->l0.me8x4[i8x8][0].cost + a->l0.me8x4[i8x8][1].cost +
                             REF_COST( 0, i_ref ) +
@@ -1397,7 +1391,7 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
         x264_mb_predict_mv( h, 0, idx, 1, m->mvp );
         x264_me_search( h, m, &a->l0.me4x4[i8x8][0].mv, i_mvc );
 
-        x264_macroblock_cache_mv( h, x4, y4, 1, 2, 0, m->mv[0], m->mv[1] );
+        x264_macroblock_cache_mv_ptr( h, x4, y4, 1, 2, 0, m->mv );
     }
     a->l0.i_cost4x8[i8x8] = a->l0.me4x8[i8x8][0].cost + a->l0.me4x8[i8x8][1].cost +
                             REF_COST( 0, i_ref ) +
@@ -1447,8 +1441,8 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     int weight;
 
     x264_me_t m;
-    int i_ref;
-    int mvc[8][2], i_mvc;
+    int i_ref, i_mvc;
+    DECLARE_ALIGNED_4( int16_t mvc[8][2] );
     int i_halfpel_thresh = INT_MAX;
     int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
@@ -1477,8 +1471,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        h->mb.mvr[0][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
     a->l0.me16x16.cost -= REF_COST( 0, a->l0.i_ref );
@@ -1505,8 +1498,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
         }
 
         /* save mv for predicting neighbors */
-        h->mb.mvr[1][i_ref][h->mb.i_mb_xy][0] = m.mv[0];
-        h->mb.mvr[1][i_ref][h->mb.i_mb_xy][1] = m.mv[1];
+        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
     }
     /* subtract ref cost, so we don't have to add it for the other MB types */
     a->l1.me16x16.cost -= REF_COST( 1, a->l1.i_ref );
@@ -1517,7 +1509,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     /* get cost of BI mode */
     weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
-    if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
+    if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
     {
         /* l0 reference is halfpel, so get_ref on it will make it faster */
         src2 =
@@ -1570,21 +1562,21 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
     switch( h->mb.i_sub_partition[i] )
     {
         case D_L0_8x8:
-            x264_macroblock_cache_mv( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv[0], a->l0.me8x8[i].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x, y, 2, 2, 0, a->l0.me8x8[i].mv );
             break;
         case D_L0_8x4:
-            x264_macroblock_cache_mv( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv[0], a->l0.me8x4[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv[0], a->l0.me8x4[i][1].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x, y+0, 2, 1, 0, a->l0.me8x4[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x, y+1, 2, 1, 0, a->l0.me8x4[i][1].mv );
             break;
         case D_L0_4x8:
-            x264_macroblock_cache_mv( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv[0], a->l0.me4x8[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv[0], a->l0.me4x8[i][1].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x+0, y, 1, 2, 0, a->l0.me4x8[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y, 1, 2, 0, a->l0.me4x8[i][1].mv );
             break;
        case D_L0_4x4:
-            x264_macroblock_cache_mv( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv[0], a->l0.me4x4[i][0].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv[0], a->l0.me4x4[i][1].mv[1] );
-            x264_macroblock_cache_mv( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv[0], a->l0.me4x4[i][2].mv[1] );
-            x264_macroblock_cache_mv( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv[0], a->l0.me4x4[i][3].mv[1] );
+            x264_macroblock_cache_mv_ptr( h, x+0, y+0, 1, 1, 0, a->l0.me4x4[i][0].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y+0, 1, 1, 0, a->l0.me4x4[i][1].mv );
+            x264_macroblock_cache_mv_ptr( h, x+0, y+1, 1, 1, 0, a->l0.me4x4[i][2].mv );
+            x264_macroblock_cache_mv_ptr( h, x+1, y+1, 1, 1, 0, a->l0.me4x4[i][3].mv );
             break;
         default:
             x264_log( h, X264_LOG_ERROR, "internal error\n" );
@@ -1596,26 +1588,26 @@ static inline void x264_mb_cache_mv_p8x8( x264_t *h, x264_mb_analysis_t *a, int
     if( x264_mb_partition_listX_table[0][part] ) \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, a->l0.i_ref ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, me0.mv[0], me0.mv[1] ); \
+        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 0, me0.mv ); \
     } \
     else \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 0, -1 ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0, 0 ); \
+        x264_macroblock_cache_mv( h, x,y,dx,dy, 0, 0 ); \
         if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0, 0 ); \
+            x264_macroblock_cache_mvd( h, x,y,dx,dy, 0, 0 ); \
     } \
     if( x264_mb_partition_listX_table[1][part] ) \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, a->l1.i_ref ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, me1.mv[0], me1.mv[1] ); \
+        x264_macroblock_cache_mv_ptr( h, x,y,dx,dy, 1, me1.mv ); \
     } \
     else \
     { \
         x264_macroblock_cache_ref( h, x,y,dx,dy, 1, -1 ); \
-        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0, 0 ); \
+        x264_macroblock_cache_mv( h, x,y,dx,dy, 1, 0 ); \
         if( b_mvd ) \
-            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0, 0 ); \
+            x264_macroblock_cache_mvd( h, x,y,dx,dy, 1, 0 ); \
     }
 
 static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int i, int b_mvd )
@@ -1627,8 +1619,8 @@ static inline void x264_mb_cache_mv_b8x8( x264_t *h, x264_mb_analysis_t *a, int
         x264_mb_load_mv_direct8x8( h, i );
         if( b_mvd )
         {
-            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0, 0 );
-            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0, 0 );
+            x264_macroblock_cache_mvd( h, x, y, 2, 2, 0, 0 );
+            x264_macroblock_cache_mvd( h, x, y, 2, 2, 1, 0 );
             x264_macroblock_cache_skip( h, x, y, 2, 2, 1 );
         }
     }
@@ -1681,7 +1673,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, &lX->me16x16.mv, 1 );
 
-            x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
+            x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
 
             /* BI mode */
             h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
@@ -1717,7 +1709,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
         { h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.p_fref[1][a->l1.i_ref] };
     DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
-    DECLARE_ALIGNED_8( int mvc[2][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_16x8;
@@ -1740,8 +1732,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
             LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)lX->me8x8[2*i].mv;
-            *(uint64_t*)mvc[1] = *(uint64_t*)lX->me8x8[2*i+1].mv;
+            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
+            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
 
             x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
             x264_me_search( h, m, mvc, 2 );
@@ -1786,7 +1778,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
         { h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.p_fref[1][a->l1.i_ref] };
     DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
-    DECLARE_ALIGNED_8( int mvc[2][2] );
+    DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
     h->mb.i_partition = D_8x16;
@@ -1808,8 +1800,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
             LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
             LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
 
-            *(uint64_t*)mvc[0] = *(uint64_t*)lX->me8x8[i].mv;
-            *(uint64_t*)mvc[1] = *(uint64_t*)lX->me8x8[i+2].mv;
+            *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
+            *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
 
             x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
             x264_me_search( h, m, mvc, 2 );
@@ -2626,21 +2618,21 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
             {
                 case D_16x16:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
                     break;
 
                 case D_16x8:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].i_ref );
                     x264_macroblock_cache_ref( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv[0], a->l0.me16x8[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv[0], a->l0.me16x8[1].mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 2, 0, a->l0.me16x8[0].mv );
+                    x264_macroblock_cache_mv_ptr( h, 0, 2, 4, 2, 0, a->l0.me16x8[1].mv );
                     break;
 
                 case D_8x16:
                     x264_macroblock_cache_ref( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].i_ref );
                     x264_macroblock_cache_ref( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv[0], a->l0.me8x16[0].mv[1] );
-                    x264_macroblock_cache_mv ( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv[0], a->l0.me8x16[1].mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 2, 4, 0, a->l0.me8x16[0].mv );
+                    x264_macroblock_cache_mv_ptr( h, 2, 0, 2, 4, 0, a->l0.me8x16[1].mv );
                     break;
 
                 default:
@@ -2662,8 +2654,7 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
         {
             h->mb.i_partition = D_16x16;
             x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, 0 );
-            x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv[0],
-                                                          h->mb.cache.pskip_mv[1] );
+            x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, h->mb.cache.pskip_mv );
             break;
         }
@@ -2689,26 +2680,26 @@ static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a )
             {
                 case B_L0_L0:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0, 0 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 1, 0 );
                     break;
                 case B_L1_L1:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, -1 );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0, 0 );
-                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0, 0 );
+                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, 0 );
+                    x264_macroblock_cache_mvd( h, 0, 0, 4, 4, 0, 0 );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                     break;
                 case B_BI_BI:
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv[0], a->l0.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
 
                     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
-                    x264_macroblock_cache_mv ( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv[0], a->l1.me16x16.mv[1] );
+                    x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 1, a->l1.me16x16.mv );
                     break;
             }
             break;
diff --git a/encoder/cabac.c b/encoder/cabac.c
index 052c0e41..d482c066 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -484,7 +484,7 @@ static inline void x264_cabac_mb_mvd_cpn( x264_t *h, x264_cabac_t *cb, int i_lis
 static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width, int height )
 {
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
     int mdx, mdy;
 
     /* Calculate mvd */
@@ -497,7 +497,7 @@ static inline void x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, i
     x264_cabac_mb_mvd_cpn( h, cb, i_list, idx, 1, mdy );
 
     /* save value */
-    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, mdx, mdy );
+    x264_macroblock_cache_mvd( h, block_idx_x[idx], block_idx_y[idx], width, height, i_list, pack16to32_mask(mdx,mdy) );
 }
 
 static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int i )
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index 54bc567e..726d024f 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -232,7 +232,7 @@ static void cavlc_qp_delta( x264_t *h, bs_t *s )
 static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
 {
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
     x264_mb_predict_mv( h, i_list, idx, width, mvp );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
     bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
@@ -408,7 +408,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
     }
     else if( i_mb_type == P_L0 )
     {
-        int mvp[2];
+        DECLARE_ALIGNED_4( int16_t mvp[2] );
 
         if( h->mb.i_partition == D_16x16 )
         {
@@ -524,7 +524,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
         /* All B mode */
         /* Motion Vector */
         int i_list;
-        int mvp[2];
+        DECLARE_ALIGNED_4( int16_t mvp[2] );
 
         int b_list[2][2];
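One more illustrative sketch before the encoder changes below (mv_equal and mv_is_halfpel_aligned are hypothetical names, not x264 functions): with MVs held as aligned int16_t pairs, two-component tests collapse into single 32-bit operations. The 0x10001 mask in the B16x16 halfpel check above tests the low bit of both components at once, and the P_SKIP condition in the next file compares both components with one equality.

#include <assert.h>
#include <stdint.h>

static int mv_equal( const int16_t a[2], const int16_t b[2] )
{
    /* One 32-bit compare instead of two 16-bit ones; operands must be
     * 4-byte aligned, as DECLARE_ALIGNED_4 ensures in the patch. */
    return *(const uint32_t*)a == *(const uint32_t*)b;
}

static int mv_is_halfpel_aligned( const int16_t mv[2] )
{
    /* 0x10001 masks bit 0 of both packed components: zero means both
     * mvx and mvy are even in quarter-pel units. */
    return ( *(const uint32_t*)mv & 0x10001 ) == 0;
}

int main( void )
{
    union { uint32_t u32; int16_t mv[2]; } a, b;
    a.mv[0] = 4; a.mv[1] = -8; /* both components even: halfpel */
    b.mv[0] = 4; b.mv[1] = -8;
    assert( mv_equal( a.mv, b.mv ) );
    assert( mv_is_halfpel_aligned( a.mv ) );
    a.mv[1] = -7;              /* odd component: quarterpel */
    assert( !mv_is_halfpel_aligned( a.mv ) );
    return 0;
}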
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 33547146..74c94130 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -585,17 +585,15 @@ void x264_macroblock_encode( x264_t *h )
     if( !b_force_no_skip )
     {
         if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
-            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma == 0x00 &&
-            h->mb.cache.mv[0][x264_scan8[0]][0] == h->mb.cache.pskip_mv[0] &&
-            h->mb.cache.mv[0][x264_scan8[0]][1] == h->mb.cache.pskip_mv[1] &&
-            h->mb.cache.ref[0][x264_scan8[0]] == 0 )
+            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
+            *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
         {
             h->mb.i_type = P_SKIP;
         }
 
         /* Check for B_SKIP */
-        if( h->mb.i_type == B_DIRECT &&
-            h->mb.i_cbp_luma == 0x00 && h->mb.i_cbp_chroma== 0x00 )
+        if( h->mb.i_type == B_DIRECT && !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) )
         {
             h->mb.i_type = B_SKIP;
         }
diff --git a/encoder/me.c b/encoder/me.c
index 7198957d..7598b76f 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -151,7 +151,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     }\
 }
 
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
 {
     const int bw = x264_pixel_size[m->i_pixel].w;
     const int bh = x264_pixel_size[m->i_pixel].h;
@@ -982,7 +982,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i8 )
 
     m->cost = bcost;
     m->mv[0] = bmx;
     m->mv[1] = bmy;
-    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, bmx, bmy );
-    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, bmx - pmx, bmy - pmy );
+    x264_macroblock_cache_mv ( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx, bmy) );
+    x264_macroblock_cache_mvd( h, 2*(i8&1), i8&2, bw, bh, 0, pack16to32_mask(bmx - pmx, bmy - pmy) );
 }
diff --git a/encoder/me.h b/encoder/me.h
index 295dd14a..96135c9e 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -39,16 +39,16 @@ typedef struct
     uint16_t *integral;
     int      i_stride[2];
 
-    int mvp[2];
+    DECLARE_ALIGNED_4( int16_t mvp[2] );
 
     /* output */
     int cost_mv;        /* lambda * nbits for the chosen mv */
     int cost;           /* satd + lambda * nbits */
-    DECLARE_ALIGNED_8( int mv[2] );
+    DECLARE_ALIGNED_4( int16_t mv[2] );
 } x264_me_t;
 
-void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
-static inline void x264_me_search( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc )
+void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
+static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
 { x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
 
 void x264_me_refine_qpel( x264_t *h, x264_me_t *m );
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 8cee4f90..58e666be 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -153,9 +153,9 @@ int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
     i_cost_bak = i_bcost;
     for( l = 0; l < 1 + b_bidir; l++ )
     {
-        int mvc[4][2] = {{0}}, i_mvc;
+        int16_t mvc[4][2] = {{0}};
+        int i_mvc = 0;
         int16_t (*fenc_mv)[2] = &fenc->mv[l][i_mb_xy];
-        i_mvc = 0;
 #define MVC(mv) { mvc[i_mvc][0] = mv[0]; mvc[i_mvc][1] = mv[1]; i_mvc++; }
         if( i_mb_x > 0 )
             MVC(fenc_mv[-1]);
-- 
2.40.0
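Finally, a simplified sketch of the alignment machinery the patch leans on. This is roughly the gcc form of the DECLARE_ALIGNED_* macros in x264's osdep.h (the exact definitions vary by compiler, so treat this as an assumption, not the library's literal code):

#include <stdint.h>

#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#define DECLARE_ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
#define DECLARE_ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )

int main( void )
{
    /* A 4-byte-aligned int16_t[2], so *(uint32_t*)mv is an aligned
     * access on every target the patch cares about. */
    DECLARE_ALIGNED_4( int16_t mv[2] ) = { 1, -1 };
    return *(uint32_t*)mv ? 0 : 1;
}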