From: Henrik Gramner
Date: Sun, 29 Jan 2017 15:41:33 +0000 (+0100)
Subject: osdep: Rework alignment macros
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d13b4c3a9574cd2fbd5407c7dfc58eeff72d2080;p=libx264

osdep: Rework alignment macros

Drop ALIGNED_N and ALIGNED_ARRAY_N in favor of using explicit alignment.

This will allow us to increase the native alignment without unnecessarily
increasing the alignment of everything that's currently 32-byte aligned.
---

diff --git a/common/common.h b/common/common.h
index c7850ca0..8cc1dc1e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -635,11 +635,11 @@ struct x264_t
     /* Current MB DCT coeffs */
     struct
     {
-        ALIGNED_N( dctcoef luma16x16_dc[3][16] );
+        ALIGNED_32( dctcoef luma16x16_dc[3][16] );
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
-        ALIGNED_N( dctcoef luma8x8[12][64] );
-        ALIGNED_N( dctcoef luma4x4[16*3][16] );
+        ALIGNED_32( dctcoef luma8x8[12][64] );
+        ALIGNED_32( dctcoef luma4x4[16*3][16] );
     } dct;
 
     /* MB table and cache for current frame/mb */
@@ -778,8 +778,8 @@ struct x264_t
             /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
-            ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
-            ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
+            ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
+            ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
 
             /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
             ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -796,8 +796,8 @@ struct x264_t
             ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
             /* Psy RD SATD/SA8D scores cache */
-            ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
-            ALIGNED_N( uint32_t fenc_satd_cache[32] );
+            ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_32( uint32_t fenc_satd_cache[32] );
 
             /* pointer over mb of the frame to be compressed */
             pixel *p_fenc[3]; /* y,u,v */
@@ -930,8 +930,8 @@ struct x264_t
     uint32_t (*nr_residual_sum)[64];
     uint32_t *nr_count;
 
-    ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
-    ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
+    ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
     uint32_t nr_count_buf[2][4];
 
     uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
diff --git a/common/macroblock.c b/common/macroblock.c
index 661e6784..e5097a6d 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
     int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
     int i_mode = x264_size2pixel[height][width];
     intptr_t i_stride0 = 16, i_stride1 = 16;
-    ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
+    ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
     pixel *src0, *src1;
 
     MC_LUMA_BI( 0 );
diff --git a/common/osdep.h b/common/osdep.h
index ed3ed597..95444018 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -108,10 +108,10 @@ int x264_is_pipe( const char *path );
 #else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
 #endif
-#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
-#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+
 #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
 
 // ARM compiliers don't reliably align stack variables
 // - EABI requires only 8 byte stack alignment to be maintained
@@ -127,37 +127,31 @@ int x264_is_pipe( const char *path );
 #if ARCH_ARM && SYS_MACOSX
 #define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
-    ALIGNED_8( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
 #endif
 
 #if ARCH_ARM
 #define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
-    ALIGNED_16( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
 #endif
 
 #define EXPAND(x) x
 
+#if ARCH_X86 || ARCH_X86_64
+#define NATIVE_ALIGN 32
+#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
 #if STACK_ALIGNMENT >= 32
-#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
-    ALIGNED_32( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
 #else
 #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
 #endif
-
 #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
-
-/* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
-#define ALIGNED_N ALIGNED_32
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
 #else
 #define NATIVE_ALIGN 16
-#define ALIGNED_N ALIGNED_16
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
+#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
+#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
 #endif
 
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 1941bf28..3fbdd53f 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1735,7 +1735,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                      pixel **p_fref, int i8x8, int size, int chroma )
 {
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
     pixel *pix2 = pix1+8;
     int i_stride = h->mb.pic.i_stride[1];
     int chroma_h_shift = chroma <= CHROMA_422;
@@ -1919,8 +1919,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 
 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
 {
-    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
-    ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
+    ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
     int i_chroma_cost = 0;
     int chromapix = h->luma2chroma_pixel[i_pixel];
 
@@ -2013,8 +2013,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
     pixel *src0, *src1;
     intptr_t stride0 = 16, stride1 = 16;
     int i_ref, i_mvc;
@@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     }
     else
     {
-        ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
+        ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
         int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
         int v_shift = CHROMA_V_SHIFT;
 
@@ -2483,7 +2483,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
 {
-    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
+    ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
     ALIGNED_4( int16_t mvc[3][2] );
 
     h->mb.i_partition = D_16x8;
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 87ba7f2d..87b076f5 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
     pixel *p_src = h->mb.pic.p_fenc[p];
     pixel *p_dst = h->mb.pic.p_fdec[p];
 
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
 
     int nz, block_cbp = 0;
     int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@@ -350,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
     int i_decimate_score = b_decimate ? 0 : 7;
     int nz_ac = 0;
 
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
 
     if( h->mb.b_lossless )
     {
@@ -780,7 +780,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
     }
     else if( h->mb.b_transform_8x8 )
     {
-        ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
+        ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
         b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 
         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -824,7 +824,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
     }
     else
     {
-        ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
+        ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
         {
             int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -965,7 +965,7 @@ void x264_macroblock_encode( x264_t *h )
  *****************************************************************************/
 static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
 {
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
     ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
     ALIGNED_4( int16_t mvp[2] );
     int i_qp = h->mb.i_qp;
@@ -1219,7 +1219,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
             int quant_cat = p ? CQM_8PC : CQM_8PY;
             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-            ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+            ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
 
             h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
             int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@@ -1252,7 +1252,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
             int i_decimate_8x8 = b_decimate ? 0 : 4;
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
+            ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
             int nnz8x8 = 0;
 
             h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@@ -1311,7 +1311,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
         i_qp = h->mb.i_chroma_qp;
         for( int ch = 0; ch < 2; ch++ )
         {
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
+            ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
             pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
 
@@ -1376,7 +1376,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
     }
     else
     {
-        ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+        ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
         nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
         h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index f8688021..9ab47006 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
     pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
 
     if( b_predict )
     {
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
     pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
-    ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
     ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
 
     if( b_predict )
diff --git a/encoder/me.c b/encoder/me.c
index 310bff7f..58a39dcf 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
     int omx, omy, pmx, pmy;
     pixel *p_fenc = m->p_fenc[0];
     pixel *p_fref_w = m->p_fref_w;
-    ALIGNED_ARRAY_N( pixel, pix,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix,[16*16] );
     ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
 
     ALIGNED_ARRAY_16( int, costs,[16] );
@@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     int chroma_v_shift = CHROMA_V_SHIFT;
     int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
 
-    ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+    ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
     ALIGNED_ARRAY_16( int, costs,[4] );
 
     int bmx = m->mv[0];
@@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
     pixel *src[3][2][9];
     int chromapix = h->luma2chroma_pixel[i_pixel];
     int chroma_v_shift = CHROMA_V_SHIFT;
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     uint64_t bcostrd = COST_MAX64;
     uint16_t amvd;
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
+    ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
     /* all permutations of an offset in up to 2 of the dimensions */
     ALIGNED_4( static const int8_t dia4d[33][4] ) = {
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index dbccb277..79e73878 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
     stride <<= b_field;
     if( b_chroma )
     {
-        ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
+        ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
         int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
         int shift = 7 - CHROMA_V_SHIFT;
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
index cd766825..bd2eafb5 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -634,8 +634,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                          const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
                          int b_chroma, int dc, int num_coefs, int idx )
 {
-    ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
-    ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
+    ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
+    ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
     const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
     const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
     const int b_interlaced = MB_INTERLACED;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 44d0896a..c201193c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -827,10 +827,10 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_dct_function_t dct_asm;
     x264_quant_function_t qf;
     int ret = 0, ok, used_asm, interlace = 0;
-    ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
     ALIGNED_16( dctcoef dctdc[2][8] );
     x264_t h_buf;
     x264_t *h = &h_buf;
@@ -1925,7 +1925,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
         ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
         ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
         ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
-        ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+        ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
         memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
         for( int j = 0; j < X264_SCAN8_SIZE; j++ )
             nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
@@ -1969,11 +1969,11 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
-    ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
+    ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
     int ret = 0, ok, used_asm;
     int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
     x264_t h_buf;
@@ -2587,7 +2587,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
     {\
         for( int j = 0; j < 256; j++ )\
         {\
-            ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+            ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
             uint8_t bitstream[2][1<<16];\
             static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
             int ac = ctx_ac[ctx_block_cat];\
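
For readers unfamiliar with the macros this patch settles on, the stand-alone sketch below (not part of the patch) shows how they behave on a GCC-style compiler: ALIGNED_32 requests 32-byte alignment directly, while ALIGNED_ARRAY_32 falls back to an over-allocate-and-round-up emulation when the ABI's stack alignment is insufficient. The ALIGNED_ARRAY_EMU shown here is a simplified stand-in for the one in osdep.h, STACK_ALIGNMENT is assumed to come from the build system, and dct4x4 is just an illustrative variable.

/* Stand-alone sketch, not part of x264: assumes a GCC-style compiler. */
#include <stdint.h>
#include <stdio.h>

#ifndef STACK_ALIGNMENT
#define STACK_ALIGNMENT 16   /* assumption: normally provided by configure */
#endif

#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )

/* Simplified emulation in the spirit of osdep.h's ALIGNED_ARRAY_EMU:
 * over-allocate by 'mask' bytes and round the pointer up to the next
 * (mask+1)-byte boundary. */
#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask];\
    type (*name) sub1 = (void*)(((intptr_t)name##_u + mask) & ~(intptr_t)mask)

#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ )
#endif

typedef int16_t dctcoef;

int main( void )
{
    /* Explicitly 32-byte aligned stack array, e.g. for AVX2 loads/stores.
     * dct4x4[i][j] indexes the same way whether the native or emulated path is used. */
    ALIGNED_ARRAY_32( dctcoef, dct4x4, [16],[16] );
    dct4x4[0][0] = 1;
    printf( "offset within 32 bytes: %d\n", (int)((intptr_t)dct4x4 & 31) );
    return 0;
}

Keeping the alignment explicit at each declaration site is what lets a later change raise NATIVE_ALIGN (e.g. for wider SIMD) without silently widening every buffer that only ever needed 32 bytes.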