From: Fiona Glaser
Date: Thu, 12 Nov 2009 13:25:32 +0000 (-0800)
Subject: Fix all aliasing violations
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=03cb8c09553f24bf800cd47893e48b0aa91f9313;p=libx264

Fix all aliasing violations

New type-punning macros perform write/read-combining without aliasing
violations, per the second-to-last part of C99 6.5p7 (the clause that
permits access through "an aggregate or union type that includes one of
the aforementioned types among its members").
GCC 4.4, however, doesn't seem to have read this part of the spec and
still warns about the violations.
Regardless, the new macros seem to fix all known aliasing
miscompilations, so perhaps the GCC warning generator is just broken.
As such, add -Wno-strict-aliasing to CFLAGS.
---

diff --git a/common/bs.h b/common/bs.h
index 0c009921..afe6b5af 100644
--- a/common/bs.h
+++ b/common/bs.h
@@ -88,7 +88,7 @@ static inline int bs_pos( bs_t *s )
 /* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
 static inline void bs_flush( bs_t *s )
 {
-    *(uint32_t*)s->p = endian_fix32( s->cur_bits << (s->i_left&31) );
+    M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
     s->p += WORD_SIZE - s->i_left / 8;
     s->i_left = WORD_SIZE*8;
 }
@@ -102,9 +102,9 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
         if( s->i_left <= 32 )
         {
 #ifdef WORDS_BIGENDIAN
-            *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+            M32( s->p ) = s->cur_bits >> (32 - s->i_left);
 #else
-            *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+            M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
 #endif
             s->i_left += 32;
             s->p += 4;
@@ -121,7 +121,7 @@ static inline void bs_write( bs_t *s, int i_count, uint32_t i_bits )
     {
         i_count -= s->i_left;
         s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
-        *(uint32_t*)s->p = endian_fix( s->cur_bits );
+        M32( s->p ) = endian_fix( s->cur_bits );
         s->p += 4;
         s->cur_bits = i_bits;
         s->i_left = 32 - i_count;
@@ -144,7 +144,7 @@ static inline void bs_write1( bs_t *s, uint32_t i_bit )
     s->i_left--;
     if( s->i_left == WORD_SIZE*8-32 )
     {
-        *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+        M32( s->p ) = endian_fix32( s->cur_bits );
         s->p += 4;
         s->i_left = WORD_SIZE*8;
     }
diff --git a/common/common.h b/common/common.h
index 8bd71d35..3ea51553 100644
--- a/common/common.h
+++ b/common/common.h
@@ -78,6 +78,21 @@ do {\
 #include
 #include
 #include
+
+/* Unions for type-punning without aliasing violations.
+ * Mn: load or store n bits, aligned, native-endian
+ * CPn: copy n bits, aligned, native-endian
+ * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
+typedef union { uint16_t i; uint8_t c[2]; } x264_union16_t;
+typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } x264_union32_t;
+typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } x264_union64_t;
+#define M16(src) (((x264_union16_t*)(src))->i)
+#define M32(src) (((x264_union32_t*)(src))->i)
+#define M64(src) (((x264_union64_t*)(src))->i)
+#define CP16(dst,src) M16(dst) = M16(src)
+#define CP32(dst,src) M32(dst) = M32(src)
+#define CP64(dst,src) M64(dst) = M64(src)
+
 #include "x264.h"
 #include "bs.h"
 #include "set.h"
diff --git a/common/dct.c b/common/dct.c
index 0aed8d0f..245347b3 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -607,11 +607,11 @@ static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 
 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 {
-    *(uint32_t*)level = *(uint32_t*)dct;
+    CP32( level, dct );
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    *(uint32_t*)(level+6) = *(uint32_t*)(dct+6);
-    *(uint64_t*)(level+8) = *(uint64_t*)(dct+8);
-    *(uint64_t*)(level+12) = *(uint64_t*)(dct+12);
+    CP32( level+6, dct+6 );
+    CP64( level+8, dct+8 );
+    CP64( level+12, dct+12 );
 }
 
 #undef ZIG
@@ -622,19 +622,19 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
         nz |= level[i];\
     }
 #define COPY4x4\
-    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
-    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+    CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 #define COPY8x8\
-    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
-    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
+    CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+    CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+    CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+    CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+    CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+    CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+    CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+    CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 
 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 {
diff --git a/common/frame.c b/common/frame.c
index 4b4cc99c..0ae0e9a1 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -728,10 +728,10 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         {\
             /* *** Get bS for each 4px for the current edge *** */\
             if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
-                *(uint32_t*)bS = 0x03030303;\
+                M32( bS ) = 0x03030303;\
             else\
             {\
-                *(uint32_t*)bS = 0x00000000;\
+                M32( bS ) = 0x00000000;\
                 for( i = 0; i < 4; i++ )\
                 {\
                     int x = i_dir == 0 ? i_edge : i;\
@@ -805,7 +805,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                         goto end##i_dir;\
                     }\
                     DEBLOCK_STRENGTH(i_dir);\
-                    if( *(uint32_t*)bS )\
+                    if( M32( bS ) )\
                         FILTER_DIR( , i_dir);\
                     end##i_dir:\
                     i_edge += b_8x8_transform+1;\
@@ -816,7 +816,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                 for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
                 {\
                     DEBLOCK_STRENGTH(i_dir);\
-                    if( *(uint32_t*)bS )\
+                    if( M32( bS ) )\
                         FILTER_DIR( , i_dir);\
                 }\
         }
diff --git a/common/macroblock.c b/common/macroblock.c
index c747241c..4468f4b8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -50,7 +50,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refb == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
             return;
         }
     }
@@ -58,7 +58,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refa == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
             return;
         }
     }
@@ -69,7 +69,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refa == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
             return;
         }
     }
@@ -77,7 +77,7 @@ void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mv
     {
         if( i_refc == i_ref )
         {
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
             return;
         }
     }
@@ -95,14 +95,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -136,14 +136,14 @@ median:
     else if( i_count == 1 )
     {
         if( i_refa == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_a;
+            CP32( mvp, mv_a );
         else if( i_refb == i_ref )
-            *(uint32_t*)mvp = *(uint32_t*)mv_b;
+            CP32( mvp, mv_b );
         else
-            *(uint32_t*)mvp = *(uint32_t*)mv_c;
+            CP32( mvp, mv_c );
     }
     else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
-        *(uint32_t*)mvp = *(uint32_t*)mv_a;
+        CP32( mvp, mv_a );
     else
         goto median;
 }
@@ -157,10 +157,10 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        !( i_refa | *(uint32_t*)mv_a ) ||
-        !( i_refb | *(uint32_t*)mv_b ) )
+        !( i_refa | M32( mv_a ) ) ||
+        !( i_refb | M32( mv_b ) ) )
     {
-        *(uint32_t*)mv = 0;
+        M32( mv ) = 0;
     }
     else
     {
@@ -259,17 +259,12 @@ static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
     if( ref[0] >= 0 )
         x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
     else
-    {
-        mv[0][0] = 0;
-        mv[0][1] = 0;
-    }
+        M32( mv[0] ) = 0;
+
     if( ref[1] >= 0 )
         x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
     else
-    {
-        mv[1][0] = 0;
-        mv[1][1] = 0;
-    }
+        M32( mv[1] ) = 0;
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
@@ -336,8 +331,8 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
         {
             *b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
                       || h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
-                      || *(uint32_t*)h->mb.cache.direct_mv[0][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[0][X264_SCAN8_0]
-                      || *(uint32_t*)h->mb.cache.direct_mv[1][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[1][X264_SCAN8_0];
+                      || M32( h->mb.cache.direct_mv[0][X264_SCAN8_0] ) != M32( h->mb.cache.mv[0][X264_SCAN8_0] )
+                      || M32( h->mb.cache.direct_mv[1][X264_SCAN8_0] ) != M32( h->mb.cache.mv[1][X264_SCAN8_0] );
         }
         else
         {
@@ -371,14 +366,10 @@ void x264_mb_load_mv_direct8x8( x264_t *h, int idx )
     const int y = 2*(idx/2);
     x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
     x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
-        *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
-        *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
-        *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
-    *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
-        *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
+    CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+0] );
+    CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8] );
+    CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+0] );
+    CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8] );
 }
 
 /* This just improves encoder performance, it's not part of the spec */
@@ -388,7 +379,7 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     int i = 0;
 
 #define SET_MVP(mvp) { \
-        *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
+        CP32( mvc[i], mvp ); \
         i++; \
     }
 
@@ -403,7 +394,11 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int16_t mvc[
     {
         int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
                                          : h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
-        if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+        if( lowres_mv[0][0] != 0x7fff )
+        {
+            M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+            i++;
+        }
     }
 
     /* spatial predictors */
@@ -982,13 +977,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.i_neighbour_intra |= MB_TOP;
 
         /* load intra4x4 */
-        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
+        CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[i_top_xy][0] );
 
         /* load non_zero_count */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+        CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[i_top_xy][12] );
         /* shift because x264_scan8[16] is misaligned */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][18] ) << 8;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][22] ) << 8;
     }
     else
     {
@@ -996,12 +991,12 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
         h->mb.cache.i_cbp_top = -1;
 
         /* load intra4x4 */
-        *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = 0xFFFFFFFFU;
+        M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
 
         /* load non_zero_count */
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] =
-        *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
+        M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
     }
 
     if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
@@ -1136,13 +1131,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            const int ir = i_top_8x8 - 1;
            const int iv = i_top_4x4 - 1;
            h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-           *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+           CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
        }
        else
        {
            const int i8 = x264_scan8[0] - 1 - 1*8;
            h->mb.cache.ref[i_list][i8] = -2;
-           *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+           M32( h->mb.cache.mv[i_list][i8] ) = 0;
        }
 
        if( h->mb.i_neighbour & MB_TOP )
@@ -1154,15 +1149,15 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
            h->mb.cache.ref[i_list][i8+2] =
            h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
-           *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
-           *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
+           CP64( h->mb.cache.mv[i_list][i8+0], h->mb.mv[i_list][iv+0] );
+           CP64( h->mb.cache.mv[i_list][i8+2], h->mb.mv[i_list][iv+2] );
        }
        else
        {
            const int i8 = x264_scan8[0] - 8;
-           *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
-           *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
-           *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
+           M64( h->mb.cache.mv[i_list][i8+0] ) = 0;
+           M64( h->mb.cache.mv[i_list][i8+2] ) = 0;
+           M32( &h->mb.cache.ref[i_list][i8] ) = (uint8_t)(-2) * 0x01010101U;
        }
 
        if( h->mb.i_neighbour & MB_TOPRIGHT )
@@ -1171,13 +1166,13 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            const int ir = i_top_8x8 + 2;
            const int iv = i_top_4x4 + 4;
            h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
-           *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+           CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
        }
        else
        {
            const int i8 = x264_scan8[0] + 4 - 1*8;
            h->mb.cache.ref[i_list][i8] = -2;
-           *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+           M32( h->mb.cache.mv[i_list][i8] ) = 0;
        }
 
        if( h->mb.i_neighbour & MB_LEFT )
@@ -1190,10 +1185,10 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            h->mb.cache.ref[i_list][i8+2*8] =
            h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
 
-           *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
-           *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
-           *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
-           *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
+           CP32( h->mb.cache.mv[i_list][i8+0*8], h->mb.mv[i_list][iv + 0*s4x4] );
+           CP32( h->mb.cache.mv[i_list][i8+1*8], h->mb.mv[i_list][iv + 1*s4x4] );
+           CP32( h->mb.cache.mv[i_list][i8+2*8], h->mb.mv[i_list][iv + 2*s4x4] );
+           CP32( h->mb.cache.mv[i_list][i8+3*8], h->mb.mv[i_list][iv + 3*s4x4] );
        }
        else
        {
@@ -1201,7 +1196,7 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            for( i = 0; i < 4; i++ )
            {
                h->mb.cache.ref[i_list][i8+i*8] = -2;
-               *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
+               M32( h->mb.cache.mv[i_list][i8+i*8] ) = 0;
            }
        }
 
@@ -1211,30 +1206,30 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y )
            {
                const int i8 = x264_scan8[0] - 8;
                const int iv = i_top_4x4;
-               *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
-               *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
+               CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
+               CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
            }
            else
            {
                const int i8 = x264_scan8[0] - 8;
-               *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
-               *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
+               M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
+               M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
            }
 
            if( i_left_type >= 0 )
            {
                const int i8 = x264_scan8[0] - 1;
                const int iv = i_mb_4x4 - 1;
-               *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
-               *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
-               *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
-               *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
+               CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+               CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+               CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+               CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
            }
            else
            {
                const int i8 = x264_scan8[0] - 1;
                for( i = 0; i < 4; i++ )
-                   *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
+                   M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
            }
        }
    }
@@ -1311,15 +1306,15 @@ void x264_macroblock_cache_save( x264_t *h )
     /* save intra4x4 */
     if( i_mb_type == I_4x4 )
     {
-        *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
-        *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
-                                                       h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+        CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+        M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+                                                  h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
     }
     else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
-        *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
+        M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
     else
-        *(uint64_t*)intra4x4_pred_mode = (uint8_t)(-1) * 0x0101010101010101ULL;
+        M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
 
 
     if( i_mb_type == I_PCM )
@@ -1335,14 +1330,14 @@ void x264_macroblock_cache_save( x264_t *h )
     else
     {
         /* save non zero count */
-        *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
-        *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
-        *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
-        *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
-        *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
-        *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+        CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+        CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+        CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+        CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+        M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+        M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+        M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+        M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
 
         if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
             h->mb.i_qp = h->mb.i_last_qp;
@@ -1365,8 +1360,8 @@ void x264_macroblock_cache_save( x264_t *h )
            h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
            for( y = 0; y < 4; y++ )
            {
-               *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
-               *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+               CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
+               CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
            }
            if( h->sh.i_type == SLICE_TYPE_B )
            {
@@ -1376,8 +1371,8 @@ void x264_macroblock_cache_save( x264_t *h )
                h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
                for( y = 0; y < 4; y++ )
                {
-                   *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
-                   *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+                   CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
+                   CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
                }
            }
        }
@@ -1386,12 +1381,12 @@ void x264_macroblock_cache_save( x264_t *h )
            int i_list;
            for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
            {
-               *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
-               *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+               M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+               M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
                for( y = 0; y < 4; y++ )
                {
-                   *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
-                   *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+                   M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
+                   M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
                }
            }
        }
@@ -1408,28 +1403,28 @@ void x264_macroblock_cache_save( x264_t *h )
        {
            for( y = 0; y < 4; y++ )
            {
-               *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
-               *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+               CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
+               CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
            }
            if( h->sh.i_type == SLICE_TYPE_B )
                for( y = 0; y < 4; y++ )
                {
-                   *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
-                   *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
+                   CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
+                   CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
                }
        }
        else
       {
           for( y = 0; y < 4; y++ )
           {
-               *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
-               *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+               M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
+               M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
           }
           if( h->sh.i_type == SLICE_TYPE_B )
               for( y = 0; y < 4; y++ )
               {
-                   *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
-                   *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
+                   M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
+                   M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
               }
       }
diff --git a/common/macroblock.h b/common/macroblock.h
index 1afc73ec..5aaf3a68 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -338,21 +338,22 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
 {
+    uint32_t *d = dst;
     if( width == 4 )
     {
        uint32_t val2 = val * 0x01010101;
-       ((uint32_t*)dst)[0] = val2;
-       if( height >= 2 ) ((uint32_t*)dst)[2] = val2;
-       if( height == 4 ) ((uint32_t*)dst)[4] = val2;
-       if( height == 4 ) ((uint32_t*)dst)[6] = val2;
+       M32( d+0 ) = val2;
+       if( height >= 2 ) M32( d+2 ) = val2;
+       if( height == 4 ) M32( d+4 ) = val2;
+       if( height == 4 ) M32( d+6 ) = val2;
     }
     else // 2
     {
        uint32_t val2 = val * 0x0101;
-       ((uint16_t*)dst)[ 0] = val2;
-       if( height >= 2 ) ((uint16_t*)dst)[ 4] = val2;
-       if( height == 4 ) ((uint16_t*)dst)[ 8] = val2;
-       if( height == 4 ) ((uint16_t*)dst)[12] = val2;
+       M16( d+0 ) = val2;
+       if( height >= 2 ) M16( d+2 ) = val2;
+       if( height == 4 ) M16( d+4 ) = val2;
+       if( height == 4 ) M16( d+6 ) = val2;
     }
 }
 static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
@@ -360,25 +361,27 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int
     int dy;
     if( width == 1 || WORD_SIZE < 8 )
     {
+        uint32_t *d = dst;
        for( dy = 0; dy < height; dy++ )
        {
-           ((uint32_t*)dst)[8*dy+0] = val;
-           if( width >= 2 ) ((uint32_t*)dst)[8*dy+1] = val;
-           if( width == 4 ) ((uint32_t*)dst)[8*dy+2] = val;
-           if( width == 4 ) ((uint32_t*)dst)[8*dy+3] = val;
+           M32( d+8*dy+0 ) = val;
+           if( width >= 2 ) M32( d+8*dy+1 ) = val;
+           if( width == 4 ) M32( d+8*dy+2 ) = val;
+           if( width == 4 ) M32( d+8*dy+3 ) = val;
        }
     }
     else
     {
        uint64_t val64 = val + ((uint64_t)val<<32);
+        uint64_t *d = dst;
        for( dy = 0; dy < height; dy++ )
        {
-           ((uint64_t*)dst)[4*dy+0] = val64;
-           if( width == 4 ) ((uint64_t*)dst)[4*dy+1] = val64;
+           M64( d+4*dy+0 ) = val64;
+           if( width == 4 ) M64( d+4*dy+1 ) = val64;
        }
     }
 }
-#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
 static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
 {
     x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
@@ -462,7 +465,7 @@ static inline int x264_mb_transform_8x8_allowed( x264_t *h )
        return 0;
     if( h->mb.i_type != P_8x8 )
        return partition_tab[h->mb.i_type];
-    return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
+    return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
 }
 
 #endif
diff --git a/common/predict.c b/common/predict.c
index 385eb5ce..119458d0 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -44,11 +44,10 @@
 #define PREDICT_16x16_DC(v) \
     for( i = 0; i < 16; i++ )\
     {\
-        uint32_t *p = (uint32_t*)src;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
-        *p++ = v;\
+        M32( src+ 0 ) = v;\
+        M32( src+ 4 ) = v;\
+        M32( src+ 8 ) = v;\
+        M32( src+12 ) = v;\
        src += FDEC_STRIDE;\
     }
@@ -104,32 +103,28 @@ static void predict_16x16_h( uint8_t *src )
     for( i = 0; i < 16; i++ )
     {
        const uint32_t v = 0x01010101 * src[-1];
-       uint32_t *p = (uint32_t*)src;
-
-       *p++ = v;
-       *p++ = v;
-       *p++ = v;
-       *p++ = v;
-
+       M32( src+ 0 ) = v;
+       M32( src+ 4 ) = v;
+       M32( src+ 8 ) = v;
+       M32( src+12 ) = v;
        src += FDEC_STRIDE;
     }
 }
 static void predict_16x16_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[ 0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[ 4-FDEC_STRIDE];
-    uint32_t v2 = *(uint32_t*)&src[ 8-FDEC_STRIDE];
-    uint32_t v3 = *(uint32_t*)&src[12-FDEC_STRIDE];
+    uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
+    uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
+    uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
+    uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
     int i;
 
     for( i = 0; i < 16; i++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = v0;
-       *p++ = v1;
-       *p++ = v2;
-       *p++ = v3;
+       M32( src+ 0 ) = v0;
+       M32( src+ 4 ) = v1;
+       M32( src+ 8 ) = v2;
+       M32( src+12 ) = v3;
        src += FDEC_STRIDE;
     }
 }
@@ -178,9 +173,8 @@ static void predict_8x8c_dc_128( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = 0x80808080;
-       *p++ = 0x80808080;
+       M32( src+0 ) = 0x80808080;
+       M32( src+4 ) = 0x80808080;
        src += FDEC_STRIDE;
     }
 }
@@ -199,16 +193,14 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = dc0;
-       *p++ = dc0;
+       M32( src+0 ) = dc0;
+       M32( src+4 ) = dc0;
        src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = dc1;
-       *p++ = dc1;
+       M32( src+0 ) = dc1;
+       M32( src+4 ) = dc1;
        src += FDEC_STRIDE;
     }
 
@@ -228,9 +220,8 @@ static void predict_8x8c_dc_top( uint8_t *src )
 
     for( y = 0; y < 8; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = dc0;
-       *p++ = dc1;
+       M32( src+0 ) = dc0;
+       M32( src+4 ) = dc1;
        src += FDEC_STRIDE;
     }
 }
@@ -264,17 +255,15 @@ static void predict_8x8c_dc( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = dc0;
-       *p++ = dc1;
+       M32( src+0 ) = dc0;
+       M32( src+4 ) = dc1;
        src += FDEC_STRIDE;
     }
 
     for( y = 0; y < 4; y++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = dc2;
-       *p++ = dc3;
+       M32( src+0 ) = dc2;
+       M32( src+4 ) = dc3;
        src += FDEC_STRIDE;
     }
 }
@@ -285,23 +274,21 @@ static void predict_8x8c_h( uint8_t *src )
     for( i = 0; i < 8; i++ )
     {
        uint32_t v = 0x01010101 * src[-1];
-       uint32_t *p = (uint32_t*)src;
-       *p++ = v;
-       *p++ = v;
+       M32( src+0 ) = v;
+       M32( src+4 ) = v;
        src += FDEC_STRIDE;
     }
 }
 static void predict_8x8c_v( uint8_t *src )
 {
-    uint32_t v0 = *(uint32_t*)&src[0-FDEC_STRIDE];
-    uint32_t v1 = *(uint32_t*)&src[4-FDEC_STRIDE];
+    uint32_t v0 = M32( src+0-FDEC_STRIDE );
+    uint32_t v1 = M32( src+4-FDEC_STRIDE );
     int i;
 
     for( i = 0; i < 8; i++ )
     {
-       uint32_t *p = (uint32_t*)src;
-       *p++ = v0;
-       *p++ = v1;
+       M32( src+0 ) = v0;
+       M32( src+4 ) = v1;
        src += FDEC_STRIDE;
     }
 }
@@ -343,7 +330,7 @@ static void predict_8x8c_p( uint8_t *src )
  ****************************************************************************/
 
 #define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
+#define SRC32(x,y) M32( &SRC(x,y) )
 
 #define PREDICT_4x4_DC(v)\
     SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
@@ -535,7 +522,7 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
        }
        else
        {
-           *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+           M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
            edge[32] = SRC(7,-1);
        }
     }
@@ -561,8 +548,8 @@ static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor,
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
        src += FDEC_STRIDE; \
     }
@@ -593,17 +580,17 @@ static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
 static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
 {
     PREDICT_8x8_LOAD_LEFT
-#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
-               ((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
+#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) =\
+               M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
 #undef ROW
 }
 static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
 {
-    const uint64_t top = *(uint64_t*)(edge+16);
+    const uint64_t top = M64( edge+16 );
     int y;
     for( y = 0; y < 8; y++ )
-        *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+        M64( src+y*FDEC_STRIDE ) = top;
 }
 static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
 {
diff --git a/common/quant.c b/common/quant.c
index 096a4b34..7434a3d8 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -178,7 +178,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( int16_t *dct, int i_max )
     int idx = i_max - 1;
     /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. idx>=0 instead of 1 works correctly for the same reason */
-    while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 )
+    while( idx >= 0 && M32( &dct[idx-1] ) == 0 )
        idx -= 2;
     if( idx >= 0 && dct[idx] == 0 )
        idx--;
@@ -218,7 +218,7 @@ static int ALWAYS_INLINE x264_coeff_last_internal( int16_t *l, int i_count )
 {
     int i_last;
     for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( *(uint64_t*)(l+i_last-3) )
+        if( M64( l+i_last-3 ) )
            break;
     while( i_last >= 0 && l[i_last] == 0 )
        i_last--;
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 5cfa6fd4..602ddcdc 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -266,12 +266,12 @@ static void predict_8x8c_dc_left( uint8_t *src )
 
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc0;
+        M64( src ) = dc0;
        src += FDEC_STRIDE;
     }
     for( y = 0; y < 4; y++ )
     {
-        *(uint64_t*)src = dc1;
+        M64( src ) = dc1;
        src += FDEC_STRIDE;
     }
 
@@ -296,8 +296,8 @@ static void predict_8x8c_dc_left( uint8_t *src )
 #define PREDICT_8x8_DC(v) \
     int y; \
     for( y = 0; y < 8; y++ ) { \
-        ((uint32_t*)src)[0] = \
-        ((uint32_t*)src)[1] = v; \
+        M32( src+0 ) = v; \
+        M32( src+4 ) = v; \
        src += FDEC_STRIDE; \
     }
diff --git a/common/x86/util.h b/common/x86/util.h
index bfb2ec8d..9d7579e7 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -38,8 +38,8 @@ static inline void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b,
         "pminsw %%mm2, %%mm0 \n"
         "pmaxsw %%mm1, %%mm0 \n"
         "movd   %%mm0, %0    \n"
-        :"=m"(*(uint32_t*)dst)
-        :"m"(*(uint32_t*)a), "m"(*(uint32_t*)b), "m"(*(uint32_t*)c)
+        :"=m"(*(x264_union32_t*)dst)
+        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
     );
 }
 #define x264_predictor_difference x264_predictor_difference_mmxext
@@ -69,7 +69,7 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
         "jg 1b \n"
         "movq %%mm4, %0 \n"
         :"=m"(output), "+r"(i_mvc)
-        :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
+        :"r"(mvc), "m"(M64( mvc ))
     );
     sum += output[0] + output[1] + output[2] + output[3];
     return sum;
@@ -98,7 +98,7 @@ static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16
         "pminsw %5, %%mm0 \n"
         "movd %%mm0, %0 \n"
         :"=r"(amvd)
-        :"m"(*(uint32_t*)mvdleft),"m"(*(uint32_t*)mvdtop),
+        :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
          "m"(pw_28),"m"(pw_2184),"m"(pw_2)
     );
     return amvd;
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 92737888..07cf5a7c 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -874,10 +874,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-            h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-            h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-            h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-            h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+            h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+            h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+            h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+            h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
             h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
@@ -918,7 +918,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
             if( b_merged_satd && i_max >= 6 )
             {
@@ -964,10 +964,10 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
         if( h->mb.i_skip_intra )
         {
             h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
-            h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
-            h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
-            h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
-            h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+            h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+            h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+            h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+            h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
             h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
             if( h->mb.i_skip_intra == 2 )
                 h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
@@ -1092,7 +1092,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
 
             if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                 /* emulate missing topright samples */
-                *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+                M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
 
             for( i = 0; i < i_max; i++ )
             {
@@ -1107,18 +1107,18 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                 {
                     a->i_predict4x4[idx] = i_mode;
                     i_best = i_satd;
-                    pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
-                    pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
-                    pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
-                    pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+                    pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
+                    pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
+                    pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
+                    pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
                     i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
                 }
             }
 
-            *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
-            *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
-            *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
-            *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+            M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+            M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+            M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+            M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
             h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
 
             h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
@@ -1163,21 +1163,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
                     cbp_luma_new = h->mb.i_cbp_luma;
                     i_best = i_satd;
 
-                    pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
+                    pels_h = M64( p_dst_by+7*FDEC_STRIDE );
                     if( !(idx&1) )
                         for( j=0; j<7; j++ )
                             pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
-                    i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
-                    i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
+                    i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
+                    i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
                 }
             }
             a->i_cbp_i8x8_luma = cbp_luma_new;
-            *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
+            M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
             if( !(idx&1) )
                 for( j=0; j<7; j++ )
                     p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
-            *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
+            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
+            M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
 
             x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
         }
@@ -1259,8 +1259,8 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
             h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
 
         /* save mv for predicting neighbors */
-        *(uint32_t*)a->l0.mvc[i_ref][0] =
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( a->l0.mvc[i_ref][0], m.mv );
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
 
     x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
@@ -1270,7 +1270,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     if( a->i_mbrd )
     {
         x264_mb_cache_fenc_satd( h );
-        if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+        if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) )
         {
             h->mb.i_partition = D_16x16;
             x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
@@ -1308,7 +1308,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
     }
 
     for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
-        *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
+        CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1335,7 +1335,7 @@ static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
             m.cost += i_ref_cost;
             i_halfpel_thresh += i_ref_cost;
-            *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+            CP32( a->l0.mvc[i_ref][i+1], m.mv );
 
             if( m.cost < l0m->cost )
                 h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
@@ -1372,7 +1372,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_partition = D_8x8;
 
     i_mvc = 1;
-    *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
+    CP32( mvc[0], a->l0.me16x16.mv );
 
     for( i = 0; i < 4; i++ )
     {
@@ -1392,7 +1392,7 @@ static void x264_mb_analyse_inter_p8x8( x264_t *h, x264_mb_analysis_t *a )
 
         x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
 
-        *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
+        CP32( mvc[i_mvc], m->mv );
         i_mvc++;
 
         /* mb type cost */
@@ -1438,9 +1438,9 @@ static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a )
         m.i_ref_cost = i_ref_cost;
 
        /* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
-        *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-        *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
-        *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+        CP32( mvc[0], a->l0.mvc[i_ref][0] );
+        CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+        CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
 
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
@@ -1487,9 +1487,9 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a )
        const int i_ref_cost = REF_COST( 0, i_ref );
        m.i_ref_cost = i_ref_cost;
 
-        *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
-        *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
-        *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+        CP32( mvc[0], a->l0.mvc[i_ref][0] );
+        CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+        CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
 
        LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
        LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
@@ -1731,7 +1731,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
        }
 
        /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
     }
     a->l0.me16x16.i_ref = a->l0.i_ref;
 
@@ -1760,7 +1760,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
        }
 
        /* save mv for predicting neighbors */
-        *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+        CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
     }
     a->l1.me16x16.i_ref = a->l1.i_ref;
 
@@ -1972,8 +1972,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
            LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
 
-           *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
-           *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+           CP32( mvc[0], lX->me8x8[2*i].mv );
+           CP32( mvc[1], lX->me8x8[2*i+1].mv );
 
            x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
@@ -2040,8 +2040,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
            LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
            LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
 
-           *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
-           *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
+           CP32( mvc[0], lX->me8x8[i].mv );
+           CP32( mvc[1], lX->me8x8[i+2].mv );
 
            x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
            x264_me_search( h, m, mvc, 2 );
@@ -2995,7 +2995,7 @@ void x264_macroblock_analyse( x264_t *h )
            static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
            int list = check_mv_lists[h->mb.i_type] - 1;
            if( list >= 0 && h->mb.i_partition != D_16x16 &&
-               *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] &&
+               M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
                h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
                h->mb.i_partition = D_16x16;
     }
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 2ddbc473..0b244a7c 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -155,8 +155,16 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
 
 #define STORE_8x8_NNZ(idx,nz)\
 {\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
-    *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
+    M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
+}
+
+#define CLEAR_16x16_NNZ \
+{\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
+    M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
 }
 
 void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
@@ -244,10 +252,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
        if( decimate_score < 6 )
        {
            h->mb.i_cbp_luma = 0;
-           *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-           *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-           *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-           *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+           CLEAR_16x16_NNZ
        }
 
     h->dctf.dct4x4dc( dct_dc4x4 );
@@ -661,10 +666,10 @@ void x264_macroblock_encode( x264_t *h )
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
                h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
                /* In RD mode, restore the now-overwritten DCT data. */
                if( h->mb.i_skip_intra == 2 )
@@ -691,10 +696,10 @@ void x264_macroblock_encode( x264_t *h )
            if( h->mb.i_skip_intra )
            {
                h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
+               M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
                h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
                /* In RD mode, restore the now-overwritten DCT data. */
                if( h->mb.i_skip_intra == 2 )
@@ -707,7 +712,7 @@ void x264_macroblock_encode( x264_t *h )
 
                if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
                    /* emulate missing topright samples */
-                   *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+                   M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
 
                if( h->mb.b_lossless )
                    x264_predict_lossless_4x4( h, p_dst, i, i_mode );
@@ -779,10 +784,7 @@ void x264_macroblock_encode( x264_t *h )
            if( i_decimate_mb < 6 && b_decimate )
            {
                h->mb.i_cbp_luma = 0;
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-               *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+               CLEAR_16x16_NNZ
            }
            else
            {
@@ -851,10 +853,7 @@ void x264_macroblock_encode( x264_t *h )
                if( i_decimate_mb < 6 )
                {
                    h->mb.i_cbp_luma = 0;
-                   *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
-                   *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
-                   *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
-                   *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+                   CLEAR_16x16_NNZ
                }
                else
                {
@@ -899,7 +898,7 @@ void x264_macroblock_encode( x264_t *h )
     {
        if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
            !(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
-           *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+           M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
            && h->mb.cache.ref[0][x264_scan8[0]] == 0 )
        {
            h->mb.i_type = P_SKIP;
diff --git a/encoder/me.c b/encoder/me.c
index 4828d5ba..999cd4fc 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -211,7 +211,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
        COST_MV_HPEL( bmx, bmy );
        for( i = 0; i < i_mvc; i++ )
        {
-           if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
+           if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
            {
                int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
                int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
@@ -643,7 +643,7 @@ me_hex2:
            {
                /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
                if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                   *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j];
+                   CP64( &mvsads[i], &mvsads[j] );
                else
                    mvsads[i] = mvsads[j];
                i += mvsads[j].sad <= sad_thresh;
@@ -659,7 +659,7 @@ me_hex2:
                nmvsad--;
                mvsads[bi] = mvsads[nmvsad];
                if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
-                   *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad];
+                   CP64( &mvsads[bi], &mvsads[nmvsad] );
                else
                    mvsads[bi] = mvsads[nmvsad];
            }
@@ -974,8 +974,10 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
                if( cost < bcost * SATD_THRESH )
                {
                    bcost = X264_MIN( cost, bcost );
-                   *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y);
-                   *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y);
+                   M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
+                   M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y);
+                   M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
+                   M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y);
                    h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
                    h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
                    uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
@@ -1038,7 +1040,8 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
    if( satd <= bsatd * SATD_THRESH ) \
    { \
        uint64_t cost; \
-       *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+       M32( cache_mv ) = pack16to32_mask(mx,my); \
+       M32( cache_mv2 ) = pack16to32_mask(mx,my); \
        cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
        COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
    } \
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 615510a0..dfd8ae10 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -373,10 +373,10 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
        ALIGNED_4( int16_t mvc[4][2] );
 
        /* Reverse-order MV prediction. */
-       *(uint32_t*)mvc[0] = 0;
-       *(uint32_t*)mvc[1] = 0;
-       *(uint32_t*)mvc[2] = 0;
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
+       M32( mvc[0] ) = 0;
+       M32( mvc[1] ) = 0;
+       M32( mvc[2] ) = 0;
+#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
        if( i_mb_x < h->sps->i_mb_width - 1 )
            MVC(fenc_mv[1]);
        if( i_mb_y < h->sps->i_mb_height - 1 )
@@ -392,20 +392,20 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
            x264_me_search( h, &m[l], mvc, i_mvc );
 
            m[l].cost -= 2; // remove mvcost from skip mbs
-           if( *(uint32_t*)m[l].mv )
+           if( M32( m[l].mv ) )
                m[l].cost += 5;
-           *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+           CP32( fenc_mvs[l], m[l].mv );
            *fenc_costs[l] = m[l].cost;
        }
        else
        {
-           *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+           CP32( m[l].mv, fenc_mvs[l] );
            m[l].cost = *fenc_costs[l];
        }
        COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
    }
 
-   if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
+   if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
        TRY_BIDIR( m[0].mv, m[1].mv, 5 );
 
    /* Store to width-2 bitfield. */
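
For reference, a minimal, self-contained sketch of how the new macros are
meant to be used. The union and macro definitions are copied from the
common/common.h hunk above; the main() harness and the buffer names are
illustrative only, not part of the patch. Declaring the test buffers as
unions also provides the 4-byte alignment that M32/CP32 assume.

#include <stdint.h>
#include <stdio.h>

/* Definitions as added to common/common.h. */
typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } x264_union32_t;
#define M32(src) (((x264_union32_t*)(src))->i)
#define CP32(dst,src) M32(dst) = M32(src)

int main( void )
{
    x264_union32_t src = { .c = { 0x01, 0x02, 0x03, 0x04 } };
    x264_union32_t dst;

    /* Read/write-combining: one 32-bit copy instead of four byte copies.
     * The access goes through x264_union32_t, which has uint32_t among its
     * members, so it falls under the union clause of C99 6.5p7; the old
     * *(uint32_t*) cast on uint8_t data did not. */
    CP32( dst.c, src.c );

    /* Write-combining: store four identical bytes in a single write, as
     * the deblocker does for bS and the cache does for non_zero_count. */
    M32( src.c ) = 0x80808080U;

    printf( "dst = %02x %02x %02x %02x\n", dst.c[0], dst.c[1], dst.c[2], dst.c[3] );
    printf( "src = %02x %02x %02x %02x\n", src.c[0], src.c[1], src.c[2], src.c[3] );
    return 0;
}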