Drop ALIGNED_N and ALIGNED_ARRAY_N in favor of using explicit alignment.
This will allow us to increase the native alignment without unnecessarily
increasing the alignment of everything that's currently 32-byte aligned.
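For reference, a condensed sketch of how the explicit macros fit together after this change (taken from the preprocessor hunks below; only the GCC-style DECLARE_ALIGNED branch appears in this excerpt):

    #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))

    #define ALIGNED_4( var )  DECLARE_ALIGNED( var, 4 )
    #define ALIGNED_8( var )  DECLARE_ALIGNED( var, 8 )
    #define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )

    #if ARCH_X86 || ARCH_X86_64
    #define NATIVE_ALIGN 32
    #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
    #else
    /* Platforms without 32-byte (AVX2) vectors only need 16-byte alignment. */
    #define NATIVE_ALIGN 16
    #define ALIGNED_32       ALIGNED_16
    #define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
    #endif

With sufficient stack alignment, a declaration such as ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ) therefore expands to dctcoef dct4x4[16][16] __attribute__((aligned(32))) on x86, while other architectures keep their existing 16-byte alignment instead of silently inheriting whatever ALIGNED_N happened to resolve to.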
/* Current MB DCT coeffs */
struct
{
- ALIGNED_N( dctcoef luma16x16_dc[3][16] );
+ ALIGNED_32( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
- ALIGNED_N( dctcoef luma8x8[12][64] );
- ALIGNED_N( dctcoef luma4x4[16*3][16] );
+ ALIGNED_32( dctcoef luma8x8[12][64] );
+ ALIGNED_32( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
+ ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
- ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
- ALIGNED_N( uint32_t fenc_satd_cache[32] );
+ ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
+ ALIGNED_32( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
uint32_t (*nr_residual_sum)[64];
uint32_t *nr_count;
- ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
- ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
+ ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
+ ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
intptr_t i_stride0 = 16, i_stride1 = 16;
- ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
- ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
+ ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
+ ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
pixel *src0, *src1;
MC_LUMA_BI( 0 );
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
-#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
-#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#else
-#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
- ALIGNED_8( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
#endif
#if ARCH_ARM
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#else
-#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
- ALIGNED_16( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
#define EXPAND(x) x
+#if ARCH_X86 || ARCH_X86_64
+#define NATIVE_ALIGN 32
+#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#if STACK_ALIGNMENT >= 32
-#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
- ALIGNED_32( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
-
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
-
-/* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
-#define ALIGNED_N ALIGNED_32
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
#else
#define NATIVE_ALIGN 16
-#define ALIGNED_N ALIGNED_16
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
+#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
+#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
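The ALIGNED_ARRAY_EMU fallback referenced above is defined outside this excerpt. As a hedged sketch of the usual emulation technique (assuming the actual definition follows the same pattern), it over-allocates a raw byte buffer and rounds the address up by hand, which is why the ARM comment above matters: the compiler's own stack alignment cannot be trusted past 8 or 16 bytes there.

    /* Sketch only; the real macro may differ in detail. Requires <stdint.h>.
     * Reserve `mask` spare bytes, then round the buffer address up to the next
     * (mask+1)-byte boundary and access the array through the aligned pointer. */
    #define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
        uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask];\
        type (*name) __VA_ARGS__ = (void*)(((intptr_t)name##_u + mask) & ~(intptr_t)(mask))

Under that scheme, ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ) on a target without 32-byte stack alignment becomes a dctcoef (*dct4x4)[16] pointing into a slightly oversized local buffer, so dct4x4[i][j] indexing is unchanged for the callers below.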
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
pixel **p_fref, int i8x8, int size, int chroma )
{
- ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+ ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *pix2 = pix1+8;
int i_stride = h->mb.pic.i_stride[1];
int chroma_h_shift = chroma <= CHROMA_422;
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
- ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
- ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
+ ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
+ ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
int i_chroma_cost = 0;
int chromapix = h->luma2chroma_pixel[i_pixel];
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
- ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
- ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+ ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
+ ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *src0, *src1;
intptr_t stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
}
else
{
- ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
+ ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int v_shift = CHROMA_V_SHIFT;
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
- ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
+ ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
ALIGNED_4( int16_t mvc[3][2] );
h->mb.i_partition = D_16x8;
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
- ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
}
else if( h->mb.b_transform_8x8 )
{
- ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
}
else
{
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
- ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
}
else
{
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
- ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
if( b_predict )
{
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
- ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
if( b_predict )
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
- ALIGNED_ARRAY_N( pixel, pix,[16*16] );
+ ALIGNED_ARRAY_32( pixel, pix,[16*16] );
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
ALIGNED_ARRAY_16( int, costs,[16] );
int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
ALIGNED_ARRAY_16( int, costs,[4] );
int bmx = m->mv[0];
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
- ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
- ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
pixel *src[3][2][9];
int chromapix = h->luma2chroma_pixel[i_pixel];
int chroma_v_shift = CHROMA_V_SHIFT;
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
+ ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
stride <<= b_field;
if( b_chroma )
{
- ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
+ ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int shift = 7 - CHROMA_V_SHIFT;
const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
int b_chroma, int dc, int num_coefs, int idx )
{
- ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
- ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
+ ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
+ ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
- ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
- ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
- ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
- ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
- ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+ ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
for( int j = 0; j < X264_SCAN8_SIZE; j++ )
nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
- ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
- ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
- ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
- ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+ ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
+ ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
+ ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
int ret = 0, ok, used_asm;
int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
x264_t h_buf;
{\
for( int j = 0; j < 256; j++ )\
{\
- ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+ ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\