New type-punning macros perform write/read combining without aliasing violations, as permitted by the second-to-last bullet of 6.5 paragraph 7 in the C99 specification.
GCC 4.4, however, doesn't seem to have read this part of the spec and still warns as if the accesses were violations.
Regardless, it seems to fix all known aliasing miscompilations, so perhaps the GCC warning generator is just broken.
As such, add -Wno-strict-aliasing to CFLAGS.
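
For context, a minimal standalone sketch of the pattern (not part of the patch): the union type and the M32/CP32 macros mirror the definitions added below, while main() and the local variables are purely illustrative. Because every access goes through a union type that lists uint8_t and uint32_t among its members, GCC's aliasing analysis treats the 32-bit store and the byte accesses as may-alias, which is why the write-combining is safe in practice even though GCC 4.4 still warns. Note the pointers must already be suitably aligned; that is also why CPn is not implemented with memcpy.

    #include <stdint.h>
    #include <stdio.h>

    /* Same shape as the union added in this patch (standalone copy for illustration). */
    typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } x264_union32_t;
    #define M32(src)      (((x264_union32_t*)(src))->i)
    #define CP32(dst,src) (M32(dst) = M32(src))

    int main(void)
    {
        /* uint32_t backing storage guarantees the 4-byte alignment Mn/CPn require. */
        uint32_t src_w = 0, dst_w = 0;
        uint8_t *src = (uint8_t*)&src_w, *dst = (uint8_t*)&dst_w;
        int i;

        for( i = 0; i < 4; i++ )
            src[i] = i + 1;   /* byte-wise writes... */
        CP32( dst, src );     /* ...copied with one aligned 32-bit load + store */

        printf( "%d %d %d %d (match: %d)\n",
                dst[0], dst[1], dst[2], dst[3], M32( dst ) == M32( src ) );
        return 0;
    }
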
/* Write the rest of cur_bits to the bitstream; results in a bitstream no longer 32-bit aligned. */
static inline void bs_flush( bs_t *s )
{
- *(uint32_t*)s->p = endian_fix32( s->cur_bits << (s->i_left&31) );
+ M32( s->p ) = endian_fix32( s->cur_bits << (s->i_left&31) );
s->p += WORD_SIZE - s->i_left / 8;
s->i_left = WORD_SIZE*8;
}
if( s->i_left <= 32 )
{
#ifdef WORDS_BIGENDIAN
- *(uint32_t*)s->p = s->cur_bits >> (32 - s->i_left);
+ M32( s->p ) = s->cur_bits >> (32 - s->i_left);
#else
- *(uint32_t*)s->p = endian_fix( s->cur_bits << s->i_left );
+ M32( s->p ) = endian_fix( s->cur_bits << s->i_left );
#endif
s->i_left += 32;
s->p += 4;
{
i_count -= s->i_left;
s->cur_bits = (s->cur_bits << s->i_left) | (i_bits >> i_count);
- *(uint32_t*)s->p = endian_fix( s->cur_bits );
+ M32( s->p ) = endian_fix( s->cur_bits );
s->p += 4;
s->cur_bits = i_bits;
s->i_left = 32 - i_count;
s->i_left--;
if( s->i_left == WORD_SIZE*8-32 )
{
- *(uint32_t*)s->p = endian_fix32( s->cur_bits );
+ M32( s->p ) = endian_fix32( s->cur_bits );
s->p += 4;
s->i_left = WORD_SIZE*8;
}
#include <string.h>
#include <assert.h>
#include <limits.h>
+
+/* Unions for type-punning without aliasing violations.
+ * Mn: load or store n bits, aligned, native-endian
+ * CPn: copy n bits, aligned, native-endian
+ * we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
+typedef union { uint16_t i; uint8_t c[2]; } x264_union16_t;
+typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } x264_union32_t;
+typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } x264_union64_t;
+#define M16(src) (((x264_union16_t*)(src))->i)
+#define M32(src) (((x264_union32_t*)(src))->i)
+#define M64(src) (((x264_union64_t*)(src))->i)
+#define CP16(dst,src) M16(dst) = M16(src)
+#define CP32(dst,src) M32(dst) = M32(src)
+#define CP64(dst,src) M64(dst) = M64(src)
+
#include "x264.h"
#include "bs.h"
#include "set.h"
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
{
- *(uint32_t*)level = *(uint32_t*)dct;
+ CP32( level, dct );
ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
- *(uint32_t*)(level+6) = *(uint32_t*)(dct+6);
- *(uint64_t*)(level+8) = *(uint64_t*)(dct+8);
- *(uint64_t*)(level+12) = *(uint64_t*)(dct+12);
+ CP32( level+6, dct+6 );
+ CP64( level+8, dct+8 );
+ CP64( level+12, dct+12 );
}
#undef ZIG
nz |= level[i];\
}
#define COPY4x4\
- *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
- *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
- *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
- *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
+ CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define COPY8x8\
- *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
- *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
- *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
- *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
- *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
- *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
- *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
- *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
+ CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+ CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+ CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+ CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+ CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
{\
/* *** Get bS for each 4px for the current edge *** */\
if( IS_INTRA( h->mb.type[mb_xy] ) || IS_INTRA( h->mb.type[mbn_xy]) )\
- *(uint32_t*)bS = 0x03030303;\
+ M32( bS ) = 0x03030303;\
else\
{\
- *(uint32_t*)bS = 0x00000000;\
+ M32( bS ) = 0x00000000;\
for( i = 0; i < 4; i++ )\
{\
int x = i_dir == 0 ? i_edge : i;\
goto end##i_dir;\
}\
DEBLOCK_STRENGTH(i_dir);\
- if( *(uint32_t*)bS )\
+ if( M32( bS ) )\
FILTER_DIR( , i_dir);\
end##i_dir:\
i_edge += b_8x8_transform+1;\
for( ; i_edge < i_edge_end; i_edge+=b_8x8_transform+1 )\
{\
DEBLOCK_STRENGTH(i_dir);\
- if( *(uint32_t*)bS )\
+ if( M32( bS ) )\
FILTER_DIR( , i_dir);\
}\
}
{
if( i_refb == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
return;
}
}
{
if( i_refa == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
return;
}
}
{
if( i_refa == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
return;
}
}
{
if( i_refc == i_ref )
{
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
return;
}
}
else if( i_count == 1 )
{
if( i_refa == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else if( i_refb == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
else
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
}
else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else
goto median;
}
else if( i_count == 1 )
{
if( i_refa == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else if( i_refb == i_ref )
- *(uint32_t*)mvp = *(uint32_t*)mv_b;
+ CP32( mvp, mv_b );
else
- *(uint32_t*)mvp = *(uint32_t*)mv_c;
+ CP32( mvp, mv_c );
}
else if( i_refb == -2 && i_refc == -2 && i_refa != -2 )
- *(uint32_t*)mvp = *(uint32_t*)mv_a;
+ CP32( mvp, mv_a );
else
goto median;
}
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
if( i_refa == -2 || i_refb == -2 ||
- !( i_refa | *(uint32_t*)mv_a ) ||
- !( i_refb | *(uint32_t*)mv_b ) )
+ !( i_refa | M32( mv_a ) ) ||
+ !( i_refb | M32( mv_b ) ) )
{
- *(uint32_t*)mv = 0;
+ M32( mv ) = 0;
}
else
{
if( ref[0] >= 0 )
x264_mb_predict_mv_16x16( h, 0, ref[0], mv[0] );
else
- {
- mv[0][0] = 0;
- mv[0][1] = 0;
- }
+ M32( mv[0] ) = 0;
+
if( ref[1] >= 0 )
x264_mb_predict_mv_16x16( h, 1, ref[1], mv[1] );
else
- {
- mv[1][0] = 0;
- mv[1][1] = 0;
- }
+ M32( mv[1] ) = 0;
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, ref[0] );
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, ref[1] );
{
*b_changed = h->mb.cache.direct_ref[0][0] != h->mb.cache.ref[0][X264_SCAN8_0]
|| h->mb.cache.direct_ref[1][0] != h->mb.cache.ref[1][X264_SCAN8_0]
- || *(uint32_t*)h->mb.cache.direct_mv[0][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[0][X264_SCAN8_0]
- || *(uint32_t*)h->mb.cache.direct_mv[1][X264_SCAN8_0] != *(uint32_t*)h->mb.cache.mv[1][X264_SCAN8_0];
+ || M32( h->mb.cache.direct_mv[0][X264_SCAN8_0] ) != M32( h->mb.cache.mv[0][X264_SCAN8_0] )
+ || M32( h->mb.cache.direct_mv[1][X264_SCAN8_0] ) != M32( h->mb.cache.mv[1][X264_SCAN8_0] );
}
else
{
const int y = 2*(idx/2);
x264_macroblock_cache_ref( h, x, y, 2, 2, 0, h->mb.cache.direct_ref[0][idx] );
x264_macroblock_cache_ref( h, x, y, 2, 2, 1, h->mb.cache.direct_ref[1][idx] );
- *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]] =
- *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]];
- *(uint64_t*)h->mb.cache.mv[0][x264_scan8[idx*4]+8] =
- *(uint64_t*)h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8];
- *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]] =
- *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]];
- *(uint64_t*)h->mb.cache.mv[1][x264_scan8[idx*4]+8] =
- *(uint64_t*)h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8];
+ CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+0] );
+ CP64( h->mb.cache.mv[0][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[0][x264_scan8[idx*4]+8] );
+ CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+0], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+0] );
+ CP64( h->mb.cache.mv[1][x264_scan8[idx*4]+8], h->mb.cache.direct_mv[1][x264_scan8[idx*4]+8] );
}
/* This just improves encoder performance, it's not part of the spec */
int i = 0;
#define SET_MVP(mvp) { \
- *(uint32_t*)mvc[i] = *(uint32_t*)mvp; \
+ CP32( mvc[i], mvp ); \
i++; \
}
{
int16_t (*lowres_mv)[2] = i_list ? h->fenc->lowres_mvs[1][h->fref1[0]->i_frame-h->fenc->i_frame-1]
: h->fenc->lowres_mvs[0][h->fenc->i_frame-h->fref0[0]->i_frame-1];
- if( lowres_mv[0][0] != 0x7fff ) *(uint32_t*)mvc[i++] = (*(uint32_t*)lowres_mv[h->mb.i_mb_xy]*2)&0xfffeffff;
+ if( lowres_mv[0][0] != 0x7fff )
+ {
+ M32( mvc[i] ) = (M32( lowres_mv[h->mb.i_mb_xy] )*2)&0xfffeffff;
+ i++;
+ }
}
/* spatial predictors */
h->mb.i_neighbour_intra |= MB_TOP;
/* load intra4x4 */
- *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.intra4x4_pred_mode[i_top_xy][0];
+ CP32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8], &h->mb.intra4x4_pred_mode[i_top_xy][0] );
/* load non_zero_count */
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] = *(uint32_t*)&h->mb.non_zero_count[i_top_xy][12];
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[0] - 8], &h->mb.non_zero_count[i_top_xy][12] );
/* shift because x264_scan8[16] is misaligned */
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][18] << 8;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = *(uint16_t*)&h->mb.non_zero_count[i_top_xy][22] << 8;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][18] ) << 8;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = M16( &h->mb.non_zero_count[i_top_xy][22] ) << 8;
}
else
{
h->mb.cache.i_cbp_top = -1;
/* load intra4x4 */
- *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] = 0xFFFFFFFFU;
+ M32( &h->mb.cache.intra4x4_pred_mode[x264_scan8[0] - 8] ) = 0xFFFFFFFFU;
/* load non_zero_count */
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0] - 8] =
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] =
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] = 0x80808080U;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8] ) = 0x80808080U;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+0] - 9] ) = 0x80808080U;
+ M32( &h->mb.cache.non_zero_count[x264_scan8[16+4] - 9] ) = 0x80808080U;
}
if( i_mb_x > 0 && i_mb_xy > h->sh.i_first_mb )
const int ir = i_top_8x8 - 1;
const int iv = i_top_4x4 - 1;
h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+ CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
}
else
{
const int i8 = x264_scan8[0] - 1 - 1*8;
h->mb.cache.ref[i_list][i8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+ M32( h->mb.cache.mv[i_list][i8] ) = 0;
}
if( h->mb.i_neighbour & MB_TOP )
h->mb.cache.ref[i_list][i8+1] = h->mb.ref[i_list][ir + 0];
h->mb.cache.ref[i_list][i8+2] =
h->mb.cache.ref[i_list][i8+3] = h->mb.ref[i_list][ir + 1];
- *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = *(uint64_t*)h->mb.mv[i_list][iv+0];
- *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = *(uint64_t*)h->mb.mv[i_list][iv+2];
+ CP64( h->mb.cache.mv[i_list][i8+0], h->mb.mv[i_list][iv+0] );
+ CP64( h->mb.cache.mv[i_list][i8+2], h->mb.mv[i_list][iv+2] );
}
else
{
const int i8 = x264_scan8[0] - 8;
- *(uint64_t*)h->mb.cache.mv[i_list][i8+0] = 0;
- *(uint64_t*)h->mb.cache.mv[i_list][i8+2] = 0;
- *(uint32_t*)&h->mb.cache.ref[i_list][i8] = (uint8_t)(-2) * 0x01010101U;
+ M64( h->mb.cache.mv[i_list][i8+0] ) = 0;
+ M64( h->mb.cache.mv[i_list][i8+2] ) = 0;
+ M32( &h->mb.cache.ref[i_list][i8] ) = (uint8_t)(-2) * 0x01010101U;
}
if( h->mb.i_neighbour & MB_TOPRIGHT )
const int ir = i_top_8x8 + 2;
const int iv = i_top_4x4 + 4;
h->mb.cache.ref[i_list][i8] = h->mb.ref[i_list][ir];
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = *(uint32_t*)h->mb.mv[i_list][iv];
+ CP32( h->mb.cache.mv[i_list][i8], h->mb.mv[i_list][iv] );
}
else
{
const int i8 = x264_scan8[0] + 4 - 1*8;
h->mb.cache.ref[i_list][i8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8] = 0;
+ M32( h->mb.cache.mv[i_list][i8] ) = 0;
}
if( h->mb.i_neighbour & MB_LEFT )
h->mb.cache.ref[i_list][i8+2*8] =
h->mb.cache.ref[i_list][i8+3*8] = h->mb.ref[i_list][ir + 1*s8x8];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+0*8] = *(uint32_t*)h->mb.mv[i_list][iv + 0*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+1*8] = *(uint32_t*)h->mb.mv[i_list][iv + 1*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+2*8] = *(uint32_t*)h->mb.mv[i_list][iv + 2*s4x4];
- *(uint32_t*)h->mb.cache.mv[i_list][i8+3*8] = *(uint32_t*)h->mb.mv[i_list][iv + 3*s4x4];
+ CP32( h->mb.cache.mv[i_list][i8+0*8], h->mb.mv[i_list][iv + 0*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+1*8], h->mb.mv[i_list][iv + 1*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+2*8], h->mb.mv[i_list][iv + 2*s4x4] );
+ CP32( h->mb.cache.mv[i_list][i8+3*8], h->mb.mv[i_list][iv + 3*s4x4] );
}
else
{
for( i = 0; i < 4; i++ )
{
h->mb.cache.ref[i_list][i8+i*8] = -2;
- *(uint32_t*)h->mb.cache.mv[i_list][i8+i*8] = 0;
+ M32( h->mb.cache.mv[i_list][i8+i*8] ) = 0;
}
}
{
const int i8 = x264_scan8[0] - 8;
const int iv = i_top_4x4;
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] = *(uint64_t*)h->mb.mvd[i_list][iv+0];
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = *(uint64_t*)h->mb.mvd[i_list][iv+2];
+ CP64( h->mb.cache.mvd[i_list][i8+0], h->mb.mvd[i_list][iv+0] );
+ CP64( h->mb.cache.mvd[i_list][i8+2], h->mb.mvd[i_list][iv+2] );
}
else
{
const int i8 = x264_scan8[0] - 8;
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+0] =
- *(uint64_t*)h->mb.cache.mvd[i_list][i8+2] = 0;
+ M64( h->mb.cache.mvd[i_list][i8+0] ) = 0;
+ M64( h->mb.cache.mvd[i_list][i8+2] ) = 0;
}
if( i_left_type >= 0 )
{
const int i8 = x264_scan8[0] - 1;
const int iv = i_mb_4x4 - 1;
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+0*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 0*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+1*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 1*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+2*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 2*s4x4];
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+3*8] = *(uint32_t*)h->mb.mvd[i_list][iv + 3*s4x4];
+ CP32( h->mb.cache.mvd[i_list][i8+0*8], h->mb.mvd[i_list][iv + 0*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+1*8], h->mb.mvd[i_list][iv + 1*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+2*8], h->mb.mvd[i_list][iv + 2*s4x4] );
+ CP32( h->mb.cache.mvd[i_list][i8+3*8], h->mb.mvd[i_list][iv + 3*s4x4] );
}
else
{
const int i8 = x264_scan8[0] - 1;
for( i = 0; i < 4; i++ )
- *(uint32_t*)h->mb.cache.mvd[i_list][i8+i*8] = 0;
+ M32( h->mb.cache.mvd[i_list][i8+i*8] ) = 0;
}
}
}
/* save intra4x4 */
if( i_mb_type == I_4x4 )
{
- *(uint32_t*)&intra4x4_pred_mode[0] = *(uint32_t*)&h->mb.cache.intra4x4_pred_mode[x264_scan8[10] ];
- *(uint32_t*)&intra4x4_pred_mode[4] = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
- h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
- h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
+ CP32( &intra4x4_pred_mode[0], &h->mb.cache.intra4x4_pred_mode[x264_scan8[10]] );
+ M32( &intra4x4_pred_mode[4] ) = pack8to32(h->mb.cache.intra4x4_pred_mode[x264_scan8[5] ],
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[7] ],
+ h->mb.cache.intra4x4_pred_mode[x264_scan8[13] ], 0);
}
else if( !h->param.b_constrained_intra || IS_INTRA(i_mb_type) )
- *(uint64_t*)intra4x4_pred_mode = I_PRED_4x4_DC * 0x0101010101010101ULL;
+ M64( intra4x4_pred_mode ) = I_PRED_4x4_DC * 0x0101010101010101ULL;
else
- *(uint64_t*)intra4x4_pred_mode = (uint8_t)(-1) * 0x0101010101010101ULL;
+ M64( intra4x4_pred_mode ) = (uint8_t)(-1) * 0x0101010101010101ULL;
if( i_mb_type == I_PCM )
else
{
/* save non zero count */
- *(uint32_t*)&non_zero_count[0*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+0*8];
- *(uint32_t*)&non_zero_count[1*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+1*8];
- *(uint32_t*)&non_zero_count[2*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+2*8];
- *(uint32_t*)&non_zero_count[3*4] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[0]+3*8];
- *(uint16_t*)&non_zero_count[16+0*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+1*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+2*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] >> 8;
- *(uint16_t*)&non_zero_count[16+3*2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] >> 8;
+ CP32( &non_zero_count[0*4], &h->mb.cache.non_zero_count[x264_scan8[0]+0*8] );
+ CP32( &non_zero_count[1*4], &h->mb.cache.non_zero_count[x264_scan8[0]+1*8] );
+ CP32( &non_zero_count[2*4], &h->mb.cache.non_zero_count[x264_scan8[0]+2*8] );
+ CP32( &non_zero_count[3*4], &h->mb.cache.non_zero_count[x264_scan8[0]+3*8] );
+ M16( &non_zero_count[16+0*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+0*2]-1] ) >> 8;
+ M16( &non_zero_count[16+1*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+1*2]-1] ) >> 8;
+ M16( &non_zero_count[16+2*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+2*2]-1] ) >> 8;
+ M16( &non_zero_count[16+3*2] ) = M32( &h->mb.cache.non_zero_count[x264_scan8[16+3*2]-1] ) >> 8;
if( h->mb.i_type != I_16x16 && h->mb.i_cbp_luma == 0 && h->mb.i_cbp_chroma == 0 )
h->mb.i_qp = h->mb.i_last_qp;
h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[0][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
}
if( h->sh.i_type == SLICE_TYPE_B )
{
h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mv[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mv[1][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
}
}
}
int i_list;
for( i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
{
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+0*s8x8] = (uint8_t)(-1) * 0x0101;
- *(uint16_t*)&h->mb.ref[i_list][i_mb_8x8+1*s8x8] = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
}
}
}
{
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[0][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[0][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[0][x264_scan8[0]+8*y+2] );
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+0];
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = *(uint64_t*)h->mb.cache.mvd[1][x264_scan8[0]+8*y+2];
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mvd[1][x264_scan8[0]+8*y+0] );
+ CP64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mvd[1][x264_scan8[0]+8*y+2] );
}
}
else
{
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mvd[0][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mvd[0][i_mb_4x4+y*s4x4+2] ) = 0;
}
if( h->sh.i_type == SLICE_TYPE_B )
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+0] = 0;
- *(uint64_t*)h->mb.mvd[1][i_mb_4x4+y*s4x4+2] = 0;
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+0] ) = 0;
+ M64( h->mb.mvd[1][i_mb_4x4+y*s4x4+2] ) = 0;
}
}
}
static ALWAYS_INLINE void x264_macroblock_cache_rect1( void *dst, int width, int height, uint8_t val )
{
+ uint32_t *d = dst;
if( width == 4 )
{
uint32_t val2 = val * 0x01010101;
- ((uint32_t*)dst)[0] = val2;
- if( height >= 2 ) ((uint32_t*)dst)[2] = val2;
- if( height == 4 ) ((uint32_t*)dst)[4] = val2;
- if( height == 4 ) ((uint32_t*)dst)[6] = val2;
+ M32( d+0 ) = val2;
+ if( height >= 2 ) M32( d+2 ) = val2;
+ if( height == 4 ) M32( d+4 ) = val2;
+ if( height == 4 ) M32( d+6 ) = val2;
}
else // 2
{
uint32_t val2 = val * 0x0101;
- ((uint16_t*)dst)[ 0] = val2;
- if( height >= 2 ) ((uint16_t*)dst)[ 4] = val2;
- if( height == 4 ) ((uint16_t*)dst)[ 8] = val2;
- if( height == 4 ) ((uint16_t*)dst)[12] = val2;
+ M16( d+0 ) = val2;
+ if( height >= 2 ) M16( d+2 ) = val2;
+ if( height == 4 ) M16( d+4 ) = val2;
+ if( height == 4 ) M16( d+6 ) = val2;
}
}
static ALWAYS_INLINE void x264_macroblock_cache_rect4( void *dst, int width, int height, uint32_t val )
int dy;
if( width == 1 || WORD_SIZE < 8 )
{
+ uint32_t *d = dst;
for( dy = 0; dy < height; dy++ )
{
- ((uint32_t*)dst)[8*dy+0] = val;
- if( width >= 2 ) ((uint32_t*)dst)[8*dy+1] = val;
- if( width == 4 ) ((uint32_t*)dst)[8*dy+2] = val;
- if( width == 4 ) ((uint32_t*)dst)[8*dy+3] = val;
+ M32( d+8*dy+0 ) = val;
+ if( width >= 2 ) M32( d+8*dy+1 ) = val;
+ if( width == 4 ) M32( d+8*dy+2 ) = val;
+ if( width == 4 ) M32( d+8*dy+3 ) = val;
}
}
else
{
uint64_t val64 = val + ((uint64_t)val<<32);
+ uint64_t *d = dst;
for( dy = 0; dy < height; dy++ )
{
- ((uint64_t*)dst)[4*dy+0] = val64;
- if( width == 4 ) ((uint64_t*)dst)[4*dy+1] = val64;
+ M64( d+4*dy+0 ) = val64;
+ if( width == 4 ) M64( d+4*dy+1 ) = val64;
}
}
}
-#define x264_macroblock_cache_mv_ptr(a,x,y,w,h,l,mv) x264_macroblock_cache_mv(a,x,y,w,h,l,*(uint32_t*)mv)
+#define x264_macroblock_cache_mv_ptr( a, x, y, w, h, l, mv ) x264_macroblock_cache_mv( a, x, y, w, h, l, M32( mv ) )
static ALWAYS_INLINE void x264_macroblock_cache_mv( x264_t *h, int x, int y, int width, int height, int i_list, uint32_t mv )
{
x264_macroblock_cache_rect4( &h->mb.cache.mv[i_list][X264_SCAN8_0+x+8*y], width, height, mv );
return 0;
if( h->mb.i_type != P_8x8 )
return partition_tab[h->mb.i_type];
- return *(uint32_t*)h->mb.i_sub_partition == D_L0_8x8*0x01010101;
+ return M32( h->mb.i_sub_partition ) == D_L0_8x8*0x01010101;
}
#endif
#define PREDICT_16x16_DC(v) \
for( i = 0; i < 16; i++ )\
{\
- uint32_t *p = (uint32_t*)src;\
- *p++ = v;\
- *p++ = v;\
- *p++ = v;\
- *p++ = v;\
+ M32( src+ 0 ) = v;\
+ M32( src+ 4 ) = v;\
+ M32( src+ 8 ) = v;\
+ M32( src+12 ) = v;\
src += FDEC_STRIDE;\
}
for( i = 0; i < 16; i++ )
{
const uint32_t v = 0x01010101 * src[-1];
- uint32_t *p = (uint32_t*)src;
-
- *p++ = v;
- *p++ = v;
- *p++ = v;
- *p++ = v;
-
+ M32( src+ 0 ) = v;
+ M32( src+ 4 ) = v;
+ M32( src+ 8 ) = v;
+ M32( src+12 ) = v;
src += FDEC_STRIDE;
}
}
static void predict_16x16_v( uint8_t *src )
{
- uint32_t v0 = *(uint32_t*)&src[ 0-FDEC_STRIDE];
- uint32_t v1 = *(uint32_t*)&src[ 4-FDEC_STRIDE];
- uint32_t v2 = *(uint32_t*)&src[ 8-FDEC_STRIDE];
- uint32_t v3 = *(uint32_t*)&src[12-FDEC_STRIDE];
+ uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
+ uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
+ uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
+ uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
int i;
for( i = 0; i < 16; i++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = v0;
- *p++ = v1;
- *p++ = v2;
- *p++ = v3;
+ M32( src+ 0 ) = v0;
+ M32( src+ 4 ) = v1;
+ M32( src+ 8 ) = v2;
+ M32( src+12 ) = v3;
src += FDEC_STRIDE;
}
}
for( y = 0; y < 8; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = 0x80808080;
- *p++ = 0x80808080;
+ M32( src+0 ) = 0x80808080;
+ M32( src+4 ) = 0x80808080;
src += FDEC_STRIDE;
}
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc0;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc0;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc1;
- *p++ = dc1;
+ M32( src+0 ) = dc1;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
for( y = 0; y < 8; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc1;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc0;
- *p++ = dc1;
+ M32( src+0 ) = dc0;
+ M32( src+4 ) = dc1;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = dc2;
- *p++ = dc3;
+ M32( src+0 ) = dc2;
+ M32( src+4 ) = dc3;
src += FDEC_STRIDE;
}
}
for( i = 0; i < 8; i++ )
{
uint32_t v = 0x01010101 * src[-1];
- uint32_t *p = (uint32_t*)src;
- *p++ = v;
- *p++ = v;
+ M32( src+0 ) = v;
+ M32( src+4 ) = v;
src += FDEC_STRIDE;
}
}
static void predict_8x8c_v( uint8_t *src )
{
- uint32_t v0 = *(uint32_t*)&src[0-FDEC_STRIDE];
- uint32_t v1 = *(uint32_t*)&src[4-FDEC_STRIDE];
+ uint32_t v0 = M32( src+0-FDEC_STRIDE );
+ uint32_t v1 = M32( src+4-FDEC_STRIDE );
int i;
for( i = 0; i < 8; i++ )
{
- uint32_t *p = (uint32_t*)src;
- *p++ = v0;
- *p++ = v1;
+ M32( src+0 ) = v0;
+ M32( src+4 ) = v1;
src += FDEC_STRIDE;
}
}
****************************************************************************/
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) *(uint32_t*)&SRC(x,y)
+#define SRC32(x,y) M32( &SRC(x,y) )
#define PREDICT_4x4_DC(v)\
SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
}
else
{
- *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+ M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
edge[32] = SRC(7,-1);
}
}
#define PREDICT_8x8_DC(v) \
int y; \
for( y = 0; y < 8; y++ ) { \
- ((uint32_t*)src)[0] = \
- ((uint32_t*)src)[1] = v; \
+ M32( src+0 ) = v; \
+ M32( src+4 ) = v; \
src += FDEC_STRIDE; \
}
static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
{
PREDICT_8x8_LOAD_LEFT
-#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
- ((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
+#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) =\
+ M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
{
- const uint64_t top = *(uint64_t*)(edge+16);
+ const uint64_t top = M64( edge+16 );
int y;
for( y = 0; y < 8; y++ )
- *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+ M64( src+y*FDEC_STRIDE ) = top;
}
static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
{
int idx = i_max - 1;
/* Yes, dct[idx-1] is guaranteed to be 32-bit aligned. idx>=0 instead of 1 works correctly for the same reason */
- while( idx >= 0 && *(uint32_t*)&dct[idx-1] == 0 )
+ while( idx >= 0 && M32( &dct[idx-1] ) == 0 )
idx -= 2;
if( idx >= 0 && dct[idx] == 0 )
idx--;
{
int i_last;
for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
- if( *(uint64_t*)(l+i_last-3) )
+ if( M64( l+i_last-3 ) )
break;
while( i_last >= 0 && l[i_last] == 0 )
i_last--;
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)src = dc0;
+ M64( src ) = dc0;
src += FDEC_STRIDE;
}
for( y = 0; y < 4; y++ )
{
- *(uint64_t*)src = dc1;
+ M64( src ) = dc1;
src += FDEC_STRIDE;
}
#define PREDICT_8x8_DC(v) \
int y; \
for( y = 0; y < 8; y++ ) { \
- ((uint32_t*)src)[0] = \
- ((uint32_t*)src)[1] = v; \
+ M32( src+0 ) = v; \
+ M32( src+4 ) = v; \
src += FDEC_STRIDE; \
}
"pminsw %%mm2, %%mm0 \n"
"pmaxsw %%mm1, %%mm0 \n"
"movd %%mm0, %0 \n"
- :"=m"(*(uint32_t*)dst)
- :"m"(*(uint32_t*)a), "m"(*(uint32_t*)b), "m"(*(uint32_t*)c)
+ :"=m"(*(x264_union32_t*)dst)
+ :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
);
}
#define x264_predictor_difference x264_predictor_difference_mmxext
"jg 1b \n"
"movq %%mm4, %0 \n"
:"=m"(output), "+r"(i_mvc)
- :"r"(mvc), "m"(*(struct {int16_t x[4];} *)mvc)
+ :"r"(mvc), "m"(M64( mvc ))
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
"pminsw %5, %%mm0 \n"
"movd %%mm0, %0 \n"
:"=r"(amvd)
- :"m"(*(uint32_t*)mvdleft),"m"(*(uint32_t*)mvdtop),
+ :"m"(M32( mvdleft )),"m"(M32( mvdtop )),
"m"(pw_28),"m"(pw_2184),"m"(pw_2)
);
return amvd;
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
- h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
- h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
- h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
- h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i8x8_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+ h->mb.pic.i8x8_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ h->mb.pic.i8x8_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+ h->mb.pic.i8x8_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
if( b_merged_satd && i_max >= 6 )
{
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 );
- h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]];
- h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]];
- h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]];
- h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]];
+ h->mb.pic.i4x4_nnz_buf[0] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] );
+ h->mb.pic.i4x4_nnz_buf[1] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] );
+ h->mb.pic.i4x4_nnz_buf[2] = M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] );
+ h->mb.pic.i4x4_nnz_buf[3] = M32( &h->mb.cache.non_zero_count[x264_scan8[10]] );
h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma;
if( h->mb.i_skip_intra == 2 )
h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
for( i = 0; i < i_max; i++ )
{
{
a->i_predict4x4[idx] = i_mode;
i_best = i_satd;
- pels[0] = *(uint32_t*)(p_dst_by+0*FDEC_STRIDE);
- pels[1] = *(uint32_t*)(p_dst_by+1*FDEC_STRIDE);
- pels[2] = *(uint32_t*)(p_dst_by+2*FDEC_STRIDE);
- pels[3] = *(uint32_t*)(p_dst_by+3*FDEC_STRIDE);
+ pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
+ pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
+ pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
+ pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
}
}
- *(uint32_t*)(p_dst_by+0*FDEC_STRIDE) = pels[0];
- *(uint32_t*)(p_dst_by+1*FDEC_STRIDE) = pels[1];
- *(uint32_t*)(p_dst_by+2*FDEC_STRIDE) = pels[2];
- *(uint32_t*)(p_dst_by+3*FDEC_STRIDE) = pels[3];
+ M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+ M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+ M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+ M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
cbp_luma_new = h->mb.i_cbp_luma;
i_best = i_satd;
- pels_h = *(uint64_t*)(p_dst_by+7*FDEC_STRIDE);
+ pels_h = M64( p_dst_by+7*FDEC_STRIDE );
if( !(idx&1) )
for( j=0; j<7; j++ )
pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
- i_nnz[0] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]];
- i_nnz[1] = *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]];
+ i_nnz[0] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] );
+ i_nnz[1] = M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] );
}
}
a->i_cbp_i8x8_luma = cbp_luma_new;
- *(uint64_t*)(p_dst_by+7*FDEC_STRIDE) = pels_h;
+ M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
if( !(idx&1) )
for( j=0; j<7; j++ )
p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] = i_nnz[0];
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] = i_nnz[1];
+ M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+0]] ) = i_nnz[0];
+ M16( &h->mb.cache.non_zero_count[x264_scan8[4*idx+2]] ) = i_nnz[1];
x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
}
h->mc.memcpy_aligned( &a->l0.me16x16, &m, sizeof(x264_me_t) );
/* save mv for predicting neighbors */
- *(uint32_t*)a->l0.mvc[i_ref][0] =
- *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( a->l0.mvc[i_ref][0], m.mv );
+ CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
}
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 0, a->l0.me16x16.i_ref );
if( a->i_mbrd )
{
x264_mb_cache_fenc_satd( h );
- if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv )
+ if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) )
{
h->mb.i_partition = D_16x16;
x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv );
}
for( i_ref = 0; i_ref <= i_maxref; i_ref++ )
- *(uint32_t*)a->l0.mvc[i_ref][0] = *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy];
+ CP32( a->l0.mvc[i_ref][0], h->mb.mvr[0][i_ref][h->mb.i_mb_xy] );
for( i = 0; i < 4; i++ )
{
m.cost += i_ref_cost;
i_halfpel_thresh += i_ref_cost;
- *(uint32_t*)a->l0.mvc[i_ref][i+1] = *(uint32_t*)m.mv;
+ CP32( a->l0.mvc[i_ref][i+1], m.mv );
if( m.cost < l0m->cost )
h->mc.memcpy_aligned( l0m, &m, sizeof(x264_me_t) );
h->mb.i_partition = D_8x8;
i_mvc = 1;
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.me16x16.mv;
+ CP32( mvc[0], a->l0.me16x16.mv );
for( i = 0; i < 4; i++ )
{
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, 0, m->mv );
- *(uint32_t*)mvc[i_mvc] = *(uint32_t*)m->mv;
+ CP32( mvc[i_mvc], m->mv );
i_mvc++;
/* mb type cost */
m.i_ref_cost = i_ref_cost;
/* if we skipped the 16x16 predictor, we wouldn't have to copy anything... */
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][2*i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][2*i+2];
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][2*i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][2*i+2] );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 0, 8*i );
LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 0, 8*i );
const int i_ref_cost = REF_COST( 0, i_ref );
m.i_ref_cost = i_ref_cost;
- *(uint32_t*)mvc[0] = *(uint32_t*)a->l0.mvc[i_ref][0];
- *(uint32_t*)mvc[1] = *(uint32_t*)a->l0.mvc[i_ref][i+1];
- *(uint32_t*)mvc[2] = *(uint32_t*)a->l0.mvc[i_ref][i+3];
+ CP32( mvc[0], a->l0.mvc[i_ref][0] );
+ CP32( mvc[1], a->l0.mvc[i_ref][i+1] );
+ CP32( mvc[2], a->l0.mvc[i_ref][i+3] );
LOAD_HPELS( &m, h->mb.pic.p_fref[0][i_ref], 0, i_ref, 8*i, 0 );
LOAD_WPELS( &m, h->mb.pic.p_fref_w[i_ref], 0, i_ref, 8*i, 0 );
}
/* save mv for predicting neighbors */
- *(uint32_t*)h->mb.mvr[0][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( h->mb.mvr[0][i_ref][h->mb.i_mb_xy], m.mv );
}
a->l0.me16x16.i_ref = a->l0.i_ref;
}
/* save mv for predicting neighbors */
- *(uint32_t*)h->mb.mvr[1][i_ref][h->mb.i_mb_xy] = *(uint32_t*)m.mv;
+ CP32( h->mb.mvr[1][i_ref][h->mb.i_mb_xy], m.mv );
}
a->l1.me16x16.i_ref = a->l1.i_ref;
LOAD_FENC( m, h->mb.pic.p_fenc, 0, 8*i );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 0, 8*i );
- *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[2*i].mv;
- *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[2*i+1].mv;
+ CP32( mvc[0], lX->me8x8[2*i].mv );
+ CP32( mvc[1], lX->me8x8[2*i+1].mv );
x264_mb_predict_mv( h, l, 8*i, 2, m->mvp );
x264_me_search( h, m, mvc, 2 );
LOAD_FENC( m, h->mb.pic.p_fenc, 8*i, 0 );
LOAD_HPELS( m, p_fref[l], l, lX->i_ref, 8*i, 0 );
- *(uint32_t*)mvc[0] = *(uint32_t*)lX->me8x8[i].mv;
- *(uint32_t*)mvc[1] = *(uint32_t*)lX->me8x8[i+2].mv;
+ CP32( mvc[0], lX->me8x8[i].mv );
+ CP32( mvc[1], lX->me8x8[i+2].mv );
x264_mb_predict_mv( h, l, 4*i, 2, m->mvp );
x264_me_search( h, m, mvc, 2 );
static const uint8_t check_mv_lists[X264_MBTYPE_MAX] = {[P_L0]=1, [B_L0_L0]=1, [B_L1_L1]=2};
int list = check_mv_lists[h->mb.i_type] - 1;
if( list >= 0 && h->mb.i_partition != D_16x16 &&
- *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[0]] == *(uint32_t*)&h->mb.cache.mv[list][x264_scan8[12]] &&
+ M32( &h->mb.cache.mv[list][x264_scan8[0]] ) == M32( &h->mb.cache.mv[list][x264_scan8[12]] ) &&
h->mb.cache.ref[list][x264_scan8[0]] == h->mb.cache.ref[list][x264_scan8[12]] )
h->mb.i_partition = D_16x16;
}
#define STORE_8x8_NNZ(idx,nz)\
{\
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\
- *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] ) = nz * 0x0101;\
+ M16( &h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] ) = nz * 0x0101;\
+}
+
+#define CLEAR_16x16_NNZ \
+{\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = 0;\
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = 0;\
}
void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
if( decimate_score < 6 )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
h->dctf.dct4x4dc( dct_dc4x4 );
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 );
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i8x8_nnz_buf[0];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i8x8_nnz_buf[1];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i8x8_nnz_buf[2];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i8x8_nnz_buf[3];
h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
if( h->mb.i_skip_intra )
{
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 );
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2];
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = h->mb.pic.i4x4_nnz_buf[0];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 2]] ) = h->mb.pic.i4x4_nnz_buf[1];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[ 8]] ) = h->mb.pic.i4x4_nnz_buf[2];
+ M32( &h->mb.cache.non_zero_count[x264_scan8[10]] ) = h->mb.pic.i4x4_nnz_buf[3];
h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp;
/* In RD mode, restore the now-overwritten DCT data. */
if( h->mb.i_skip_intra == 2 )
if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- *(uint32_t*) &p_dst[4-FDEC_STRIDE] = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+ M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst, i, i_mode );
if( i_decimate_mb < 6 && b_decimate )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
else
{
if( i_decimate_mb < 6 )
{
h->mb.i_cbp_luma = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0;
- *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0;
+ CLEAR_16x16_NNZ
}
else
{
{
if( h->mb.i_type == P_L0 && h->mb.i_partition == D_16x16 &&
!(h->mb.i_cbp_luma | h->mb.i_cbp_chroma) &&
- *(uint32_t*)h->mb.cache.mv[0][x264_scan8[0]] == *(uint32_t*)h->mb.cache.pskip_mv
+ M32( h->mb.cache.mv[0][x264_scan8[0]] ) == M32( h->mb.cache.pskip_mv )
&& h->mb.cache.ref[0][x264_scan8[0]] == 0 )
{
h->mb.i_type = P_SKIP;
COST_MV_HPEL( bmx, bmy );
for( i = 0; i < i_mvc; i++ )
{
- if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) )
+ if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
{
int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
{
/* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j];
+ CP64( &mvsads[i], &mvsads[j] );
else
mvsads[i] = mvsads[j];
i += mvsads[j].sad <= sad_thresh;
nmvsad--;
mvsads[bi] = mvsads[nmvsad];
if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad];
+ CP64( &mvsads[bi], &mvsads[nmvsad] );
else
mvsads[bi] = mvsads[nmvsad];
}
if( cost < bcost * SATD_THRESH )
{
bcost = X264_MIN( cost, bcost );
- *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y);
- *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y);
+ M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
+ M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y);
+ M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
+ M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y);
h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
if( satd <= bsatd * SATD_THRESH ) \
{ \
uint64_t cost; \
- *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \
+ M32( cache_mv ) = pack16to32_mask(mx,my); \
+ M32( cache_mv2 ) = pack16to32_mask(mx,my); \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
ALIGNED_4( int16_t mvc[4][2] );
/* Reverse-order MV prediction. */
- *(uint32_t*)mvc[0] = 0;
- *(uint32_t*)mvc[1] = 0;
- *(uint32_t*)mvc[2] = 0;
-#define MVC(mv) { *(uint32_t*)mvc[i_mvc] = *(uint32_t*)mv; i_mvc++; }
+ M32( mvc[0] ) = 0;
+ M32( mvc[1] ) = 0;
+ M32( mvc[2] ) = 0;
+#define MVC(mv) { CP32( mvc[i_mvc], mv ); i_mvc++; }
if( i_mb_x < h->sps->i_mb_width - 1 )
MVC(fenc_mv[1]);
if( i_mb_y < h->sps->i_mb_height - 1 )
x264_me_search( h, &m[l], mvc, i_mvc );
m[l].cost -= 2; // remove mvcost from skip mbs
- if( *(uint32_t*)m[l].mv )
+ if( M32( m[l].mv ) )
m[l].cost += 5;
- *(uint32_t*)fenc_mvs[l] = *(uint32_t*)m[l].mv;
+ CP32( fenc_mvs[l], m[l].mv );
*fenc_costs[l] = m[l].cost;
}
else
{
- *(uint32_t*)m[l].mv = *(uint32_t*)fenc_mvs[l];
+ CP32( m[l].mv, fenc_mvs[l] );
m[l].cost = *fenc_costs[l];
}
COPY2_IF_LT( i_bcost, m[l].cost, list_used, l+1 );
}
- if( b_bidir && ( *(uint32_t*)m[0].mv || *(uint32_t*)m[1].mv ) )
+ if( b_bidir && ( M32( m[0].mv ) || M32( m[1].mv ) ) )
TRY_BIDIR( m[0].mv, m[1].mv, 5 );
/* Store to width-2 bitfield. */