From: Henrik Gramner
Date: Sun, 29 Jan 2017 15:41:33 +0000 (+0100)
Subject: osdep: Rework alignment macros
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d13b4c3a9574cd2fbd5407c7dfc58eeff72d2080;p=libx264

osdep: Rework alignment macros

Drop ALIGNED_N and ALIGNED_ARRAY_N in favor of using explicit alignment.

This will allow us to increase the native alignment without unnecessarily
increasing the alignment of everything that's currently 32-byte aligned.
---

diff --git a/common/common.h b/common/common.h
index c7850ca0..8cc1dc1e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -635,11 +635,11 @@ struct x264_t
     /* Current MB DCT coeffs */
     struct
     {
-        ALIGNED_N( dctcoef luma16x16_dc[3][16] );
+        ALIGNED_32( dctcoef luma16x16_dc[3][16] );
         ALIGNED_16( dctcoef chroma_dc[2][8] );
         // FIXME share memory?
-        ALIGNED_N( dctcoef luma8x8[12][64] );
-        ALIGNED_N( dctcoef luma4x4[16*3][16] );
+        ALIGNED_32( dctcoef luma8x8[12][64] );
+        ALIGNED_32( dctcoef luma4x4[16*3][16] );
     } dct;
 
     /* MB table and cache for current frame/mb */
@@ -778,8 +778,8 @@ struct x264_t
             /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
-            ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
-            ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
+            ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
+            ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
 
             /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
             ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
@@ -796,8 +796,8 @@ struct x264_t
             ALIGNED_16( dctcoef fenc_dct4[16][16] );
 
             /* Psy RD SATD/SA8D scores cache */
-            ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
-            ALIGNED_N( uint32_t fenc_satd_cache[32] );
+            ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_32( uint32_t fenc_satd_cache[32] );
 
             /* pointer over mb of the frame to be compressed */
             pixel *p_fenc[3]; /* y,u,v */
@@ -930,8 +930,8 @@ struct x264_t
     uint32_t (*nr_residual_sum)[64];
     uint32_t *nr_count;
 
-    ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
-    ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
+    ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
+    ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
     uint32_t nr_count_buf[2][4];
 
     uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
diff --git a/common/macroblock.c b/common/macroblock.c
index 661e6784..e5097a6d 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
     int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
     int i_mode = x264_size2pixel[height][width];
     intptr_t i_stride0 = 16, i_stride1 = 16;
-    ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
+    ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
     pixel *src0, *src1;
 
     MC_LUMA_BI( 0 );
diff --git a/common/osdep.h b/common/osdep.h
index ed3ed597..95444018 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -108,10 +108,10 @@ int x264_is_pipe( const char *path );
 #else
 #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
 #endif
-#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
-#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+
 #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
 
 // ARM compiliers don't reliably align stack variables
 // - EABI requires only 8 byte stack alignment to be maintained
@@ -127,37 +127,31 @@ int x264_is_pipe( const char *path );
 #if ARCH_ARM && SYS_MACOSX
 #define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
-    ALIGNED_8( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
 #endif
 
 #if ARCH_ARM
 #define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
 #else
-#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
-    ALIGNED_16( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
 #endif
 
 #define EXPAND(x) x
 
+#if ARCH_X86 || ARCH_X86_64
+#define NATIVE_ALIGN 32
+#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
 #if STACK_ALIGNMENT >= 32
-#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
-    ALIGNED_32( type name sub1 __VA_ARGS__ )
+#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
 #else
 #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
 #endif
-
 #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
-
-/* For AVX2 */
-#if ARCH_X86 || ARCH_X86_64
-#define NATIVE_ALIGN 32
-#define ALIGNED_N ALIGNED_32
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
 #else
 #define NATIVE_ALIGN 16
-#define ALIGNED_N ALIGNED_16
-#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
+#define ALIGNED_32 ALIGNED_16
+#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
+#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
 #endif
 
 #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 1941bf28..3fbdd53f 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1735,7 +1735,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
 static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
                                                                      pixel **p_fref, int i8x8, int size, int chroma )
 {
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
     pixel *pix2 = pix1+8;
     int i_stride = h->mb.pic.i_stride[1];
     int chroma_h_shift = chroma <= CHROMA_422;
@@ -1919,8 +1919,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
 
 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
 {
-    ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
-    ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
+    ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
     int i_chroma_cost = 0;
     int chromapix = h->luma2chroma_pixel[i_pixel];
 
@@ -2013,8 +2013,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
-    ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
-    ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
     pixel *src0, *src1;
     intptr_t stride0 = 16, stride1 = 16;
     int i_ref, i_mvc;
@@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     }
     else
     {
-        ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
+        ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
         int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
         int v_shift = CHROMA_V_SHIFT;
 
@@ -2483,7 +2483,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
 
 static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
 {
-    ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
+    ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
     ALIGNED_4( int16_t mvc[3][2] );
 
     h->mb.i_partition = D_16x8;
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 87ba7f2d..87b076f5 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
     pixel *p_src = h->mb.pic.p_fenc[p];
     pixel *p_dst = h->mb.pic.p_fdec[p];
 
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] );
 
     int nz, block_cbp = 0;
     int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@@ -350,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
     int i_decimate_score = b_decimate ? 0 : 7;
     int nz_ac = 0;
 
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
 
     if( h->mb.b_lossless )
     {
@@ -780,7 +780,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
     }
     else if( h->mb.b_transform_8x8 )
     {
-        ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
+        ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] );
         b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
 
         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@@ -824,7 +824,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
     }
     else
     {
-        ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
+        ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] );
         for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
         {
             int quant_cat = p ? CQM_4PC : CQM_4PY;
@@ -965,7 +965,7 @@ void x264_macroblock_encode( x264_t *h )
  *****************************************************************************/
 static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
 {
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] );
     ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
     ALIGNED_4( int16_t mvp[2] );
     int i_qp = h->mb.i_qp;
@@ -1219,7 +1219,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
             int quant_cat = p ? CQM_8PC : CQM_8PY;
             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
-            ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+            ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
 
             h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
             int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@@ -1252,7 +1252,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
             pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
             int i_decimate_8x8 = b_decimate ? 0 : 4;
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
+            ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] );
             int nnz8x8 = 0;
 
             h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@@ -1311,7 +1311,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
         i_qp = h->mb.i_chroma_qp;
         for( int ch = 0; ch < 2; ch++ )
         {
-            ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
+            ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] );
             pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
             pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
 
@@ -1376,7 +1376,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
     }
     else
     {
-        ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+        ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
         h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
         nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
         h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index f8688021..9ab47006 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
     pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
-    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] );
 
     if( b_predict )
     {
@@ -154,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
     int nz;
     pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
     pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
-    ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] );
     ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
 
     if( b_predict )
diff --git a/encoder/me.c b/encoder/me.c
index 310bff7f..58a39dcf 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
     int omx, omy, pmx, pmy;
     pixel *p_fenc = m->p_fenc[0];
     pixel *p_fref_w = m->p_fref_w;
-    ALIGNED_ARRAY_N( pixel, pix,[16*16] );
+    ALIGNED_ARRAY_32( pixel, pix,[16*16] );
     ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
 
     ALIGNED_ARRAY_16( int, costs,[16] );
@@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     int chroma_v_shift = CHROMA_V_SHIFT;
     int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
 
-    ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
+    ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
     ALIGNED_ARRAY_16( int, costs,[4] );
 
     int bmx = m->mv[0];
@@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
-    ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
+    ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
     pixel *src[3][2][9];
     int chromapix = h->luma2chroma_pixel[i_pixel];
     int chroma_v_shift = CHROMA_V_SHIFT;
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     uint64_t bcostrd = COST_MAX64;
     uint16_t amvd;
     /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
-    ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
+    ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
     /* all permutations of an offset in up to 2 of the dimensions */
     ALIGNED_4( static const int8_t dia4d[33][4] ) = {
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index dbccb277..79e73878 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
     stride <<= b_field;
     if( b_chroma )
     {
-        ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
+        ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
         int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
         int shift = 7 - CHROMA_V_SHIFT;
 
diff --git a/encoder/rdo.c b/encoder/rdo.c
index cd766825..bd2eafb5 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -634,8 +634,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                          const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
                          int b_chroma, int dc, int num_coefs, int idx )
 {
-    ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
-    ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
+    ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
+    ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] );
     const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
     const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
     const int b_interlaced = MB_INTERLACED;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 44d0896a..c201193c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -827,10 +827,10 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_dct_function_t dct_asm;
     x264_quant_function_t qf;
     int ret = 0, ok, used_asm, interlace = 0;
-    ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] );
     ALIGNED_16( dctcoef dctdc[2][8] );
     x264_t h_buf;
     x264_t *h = &h_buf;
@@ -1925,7 +1925,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
         ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
         ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
         ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
-        ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+        ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
         memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
         for( int j = 0; j < X264_SCAN8_SIZE; j++ )
             nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
@@ -1969,11 +1969,11 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
-    ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
-    ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
-    ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
+    ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
+    ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
+    ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
     int ret = 0, ok, used_asm;
     int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
     x264_t h_buf;
@@ -2587,7 +2587,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
     {\
         for( int j = 0; j < 256; j++ )\
         {\
-            ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+            ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
             uint8_t bitstream[2][1<<16];\
             static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
             int ac = ctx_ac[ctx_block_cat];\
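
For readers unfamiliar with the macros this patch settles on, the stand-alone sketch below (not part of the patch) shows how they behave on a GCC-style compiler: ALIGNED_32 requests 32-byte alignment directly, while ALIGNED_ARRAY_32 falls back to an over-allocate-and-round-up emulation when the ABI's stack alignment is insufficient. The ALIGNED_ARRAY_EMU shown here is a simplified stand-in for the one in osdep.h, STACK_ALIGNMENT is assumed to come from the build system, and dct4x4 is just an illustrative variable.

/* Stand-alone sketch, not part of x264: assumes a GCC-style compiler. */
#include <stdint.h>
#include <stdio.h>

#ifndef STACK_ALIGNMENT
#define STACK_ALIGNMENT 16   /* assumption: normally provided by configure */
#endif

#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )

/* Simplified emulation in the spirit of osdep.h's ALIGNED_ARRAY_EMU:
 * over-allocate by 'mask' bytes and round the pointer up to the next
 * (mask+1)-byte boundary. */
#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask];\
    type (*name) sub1 = (void*)(((intptr_t)name##_u + mask) & ~(intptr_t)mask)

#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ )
#endif

typedef int16_t dctcoef;

int main( void )
{
    /* Explicitly 32-byte aligned stack array, e.g. for AVX2 loads/stores.
     * dct4x4[i][j] indexes the same way whether the native or emulated path is used. */
    ALIGNED_ARRAY_32( dctcoef, dct4x4, [16],[16] );
    dct4x4[0][0] = 1;
    printf( "offset within 32 bytes: %d\n", (int)((intptr_t)dct4x4 & 31) );
    return 0;
}

Keeping the alignment explicit at each declaration site is what lets a later change raise NATIVE_ALIGN (e.g. for wider SIMD) without silently widening every buffer that only ever needed 32 bytes.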