* Mn: load or store n bits, aligned, native-endian
* CPn: copy n bits, aligned, native-endian
* we don't use memcpy for CPn because memcpy's args aren't assumed to be aligned */
-typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS x264_union16_t;
-typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS x264_union32_t;
-typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+typedef union { uint16_t i; uint8_t b[2]; } MAY_ALIAS x264_union16_t;
+typedef union { uint32_t i; uint16_t w[2]; uint8_t b[4]; } MAY_ALIAS x264_union32_t;
+typedef union { uint64_t i; uint32_t d[2]; uint16_t w[4]; uint8_t b[8]; } MAY_ALIAS x264_union64_t;
typedef struct { uint64_t i[2]; } x264_uint128_t;
-typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
+typedef union { x264_uint128_t i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_t;
#define M16(src) (((x264_union16_t*)(src))->i)
#define M32(src) (((x264_union32_t*)(src))->i)
#define M64(src) (((x264_union64_t*)(src))->i)
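/* Illustration (not part of the patch): the Mn() macros type-pun through the
 * MAY_ALIAS unions, so an aligned n-bit value can be read or written through
 * a byte pointer in one access without violating strict aliasing; the same
 * assignment pattern is essentially what the CPn copy macros expand to.
 * A hypothetical sketch, assuming dst and src are 4-byte aligned: */
#if 0
static uint16_t copy32_example( uint8_t *dst, const uint8_t *src )
{
    M32( dst ) = M32( src );              /* one aligned 32-bit load + store */
    return ((x264_union32_t*)dst)->w[1];  /* members are now named b/w/d/q by element width */
}
#endif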
#define MC_CLIP_ADD(s,x)\
do\
{\
- int temp;\
+ int temp_s = s;\
+ int temp_x = x;\
asm("movd %0, %%xmm0 \n"\
- "movd %2, %%xmm1 \n"\
+ "movd %1, %%xmm1 \n"\
"paddsw %%xmm1, %%xmm0 \n"\
- "movd %%xmm0, %1 \n"\
- :"+m"(s), "=&r"(temp)\
- :"m"(x)\
+ "movd %%xmm0, %0 \n"\
+ :"+&r"(temp_s)\
+ :"r"(temp_x)\
);\
- s = temp;\
+ s = temp_s;\
} while( 0 )
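/* Illustration (not part of the patch): paddsw adds each signed 16-bit lane
 * with saturation, so on the word actually used the macro behaves like a
 * clamped add. A rough scalar equivalent, assuming the operands fit in
 * int16_t: */
#if 0
static inline int16_t clip_add_example( int16_t s, int16_t x )
{
    int sum = s + x;                 /* widen so the intermediate can't overflow */
    if( sum >  32767 ) sum =  32767; /* saturate like paddsw does per lane */
    if( sum < -32768 ) sum = -32768;
    return (int16_t)sum;
}
#endif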
#undef MC_CLIP_ADD2
#define MC_CLIP_ADD2(s,x)\
do\
{\
+ x264_union32_t temp = { .w={ (s)[0], (s)[1] } };\
asm("movd %0, %%xmm0 \n"\
"movd %1, %%xmm1 \n"\
"paddsw %%xmm1, %%xmm0 \n"\
"movd %%xmm0, %0 \n"\
- :"+m"(M32(s))\
+ :"+&r"(temp)\
:"m"(M32(x))\
);\
+ (s)[0] = temp.w[0];\
+ (s)[1] = temp.w[1];\
} while( 0 )
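/* Illustration (not part of the patch): the designated initializer packs both
 * int16_t lanes into one x264_union32_t so the pair travels in a single GPR
 * ("+&r") instead of being updated through a memory operand, and one paddsw
 * updates both lanes. A hypothetical call site, assuming 4-byte-aligned
 * arrays: */
#if 0
static void clip_add2_example( int16_t dst[2], int16_t src[2] )
{
    MC_CLIP_ADD2( dst, src );  /* dst[0] += src[0]; dst[1] += src[1]; each add saturating */
}
#endif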
#endif
#undef M128_ZERO
#define M128_ZERO ((__m128){0,0,0,0})
#define x264_union128_t x264_union128_sse_t
-typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+typedef union { __m128 i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_sse_t;
#if HAVE_VECTOREXT
typedef uint32_t v4si __attribute__((vector_size (16)));
#endif
int i_cost4x8[4]; /* cost per 8x8 partition */
int i_cost16x8;
int i_cost8x16;
- /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
- ALIGNED_4( int16_t mvc[32][5][2] );
+ /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3], [ref][5] is padding to keep each row 8-byte aligned */
+ ALIGNED_8( int16_t mvc[32][6][2] );
} x264_mb_analysis_list_t;
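/* Illustration (not part of the patch): with 5 entries a row is 5*2*2 = 20
 * bytes, so successive mvc[ref] rows alternate between 8-byte-aligned and
 * 4-byte-misaligned offsets; padding to 6 entries makes each row 24 bytes, a
 * multiple of 8, so ALIGNED_8 on the array keeps every mvc[ref] aligned.
 * A hypothetical C11 compile-time check (static_assert via <assert.h>): */
#if 0
static_assert( sizeof(((x264_mb_analysis_list_t*)0)->mvc[0]) % 8 == 0,
               "each mvc[ref] row must stay 8-byte aligned" );
#endif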
typedef struct
{
x264_me_t m;
int i_mvc;
- ALIGNED_4( int16_t mvc[8][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[8],[2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = (a->b_early_terminate && h->mb.pic.i_fref[0]>1) ? &i_halfpel_thresh : NULL;
{
x264_me_t m;
pixel **p_fenc = h->mb.pic.p_fenc;
- ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
/* XXX Needed for x264_mb_predict_mv */
h->mb.i_partition = D_16x8;
{
x264_me_t m;
pixel **p_fenc = h->mb.pic.p_fenc;
- ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
/* XXX Needed for x264_mb_predict_mv */
h->mb.i_partition = D_8x16;
pixel *src0, *src1;
intptr_t stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
- ALIGNED_4( int16_t mvc[9][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[9],[2] );
int try_skip = a->b_try_skip;
int list1_skipped = 0;
int i_halfpel_thresh[2] = {INT_MAX, INT_MAX};
static void mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
- ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
h->mb.i_partition = D_16x8;
a->i_cost16x8bi = 0;
static void mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
ALIGNED_ARRAY_16( pixel, pix,[2],[8*16] );
- ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[3],[2] );
h->mb.i_partition = D_8x16;
a->i_cost8x16bi = 0;
/* output */
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
- ALIGNED_4( int16_t mv[2] );
+ ALIGNED_8( int16_t mv[2] );
} ALIGNED_64( x264_me_t );
#define x264_me_search_ref x264_template(me_search_ref)
}
#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
+ uint64_t level_state0;
+ memcpy( &level_state0, cabac_state, sizeof(uint64_t) );
+ uint16_t level_state1;
+ memcpy( &level_state1, cabac_state+8, sizeof(uint16_t) );
#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
- cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
+ cabac_state_sig, cabac_state_last, level_state0, level_state1
if( num_coefs == 16 && !dc )
if( b_chroma || !h->mb.i_psy_trellis )
return h->quantf.trellis_cabac_4x4( TRELLIS_ARGS, b_ac );
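/* Illustration (not part of the patch): a fixed-size memcpy into a local is
 * the portable way to load from a byte array without the alignment and
 * aliasing assumptions baked into the M64()/M16() casts, and optimizing
 * compilers typically reduce it to a single mov. A hypothetical helper for
 * comparison (memcpy from <string.h>): */
#if 0
static inline uint64_t load64_example( const void *p )
{
    uint64_t v;
    memcpy( &v, p, sizeof(v) );  /* usually compiles to one unaligned-safe load */
    return v;
}
#endif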
{
int i_mvc = 0;
int16_t (*fenc_mv)[2] = fenc_mvs[l];
- ALIGNED_4( int16_t mvc[4][2] );
+ ALIGNED_ARRAY_8( int16_t, mvc,[4],[2] );
/* Reverse-order MV prediction. */
M32( mvc[0] ) = 0;
cextern_naked puts
; max number of args used by any x264 asm function.
-; (max_args % 4) must equal 3 for stack alignment
%define max_args 15
%if ARCH_X86_64
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal checkasm_call, 2,15,16,max_args*8+8
+cglobal checkasm_call, 2,15,16,-1*(((max_args+1)*8+STACK_ALIGNMENT-1) & ~(STACK_ALIGNMENT-1))
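; Illustration (not part of the patch): ignoring the sign (an x86inc
; convention for how the allocation is performed), the expression just rounds
; the spill area for max_args+1 qwords up to a multiple of STACK_ALIGNMENT.
; E.g. with max_args=15 and STACK_ALIGNMENT=16: (16*8 + 15) & ~15 = 128 bytes,
; so the "ok" pointer stored below at [rsp+max_args*8] (offset 120) still fits.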
mov r6, r0
mov [rsp+max_args*8], r1
mov r5, r11mp
%assign i 6
%rep max_args-6
- mov r9, [rsp+stack_offset+(i+1)*8]
+ mov r9, [rstk+stack_offset+(i+1)*8]
mov [rsp+(i-6)*8], r9
%assign i i+1
%endrep
%else
%assign i 4
%rep max_args-4
- mov r9, [rsp+stack_offset+(i+7)*8]
+ mov r9, [rstk+stack_offset+(i+7)*8]
mov [rsp+i*8], r9
%assign i i+1
%endrep
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
-cglobal checkasm_call, 1,7
+cglobal checkasm_call, 2,7,0,-1*(((max_args+1)*4+STACK_ALIGNMENT-1) & ~(STACK_ALIGNMENT-1))
+ mov [esp+max_args*4], r1
+%assign i 0
+%rep max_args
+ mov r1, [rstk+stack_offset+12+i*4]
+ mov [esp+i*4], r1
+ %assign i i+1
+%endrep
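; Illustration (not part of the patch): instead of pushing max_args dwords
; (which moved esp and is why the old code required (max_args % 4) == 3 for
; alignment), the caller's variadic arguments are copied into the
; pre-allocated aligned area at [esp+0 .. esp+(max_args-1)*4], with the "ok"
; pointer parked at [esp+max_args*4]; esp no longer changes across the call.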
mov r3, n3
mov r4, n4
mov r5, n5
mov r6, n6
-%rep max_args
- push dword [esp+24+max_args*4]
-%endrep
call r0
- add esp, max_args*4
xor r3, n3
xor r4, n4
xor r5, n5
mov r3, eax
mov r4, edx
lea r1, [error_message]
- push r1
+ mov [esp], r1
call puts
- add esp, 4
- mov r1, r1m
+ mov r1, [esp+max_args*4]
mov dword [r1], 0
mov edx, r4
mov eax, r3
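/* Illustration (not part of the patch): a minimal way the wrapper could be
 * driven from C, based only on the prototype in the comment above; the real
 * checkasm harness invokes it through its own macros, and the names below
 * are hypothetical. */
#if 0
extern intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );

static int test_one_function( intptr_t (*asm_func)(), uint8_t *dst, uint8_t *src )
{
    int ok = 1;
    intptr_t ret = x264_checkasm_call( asm_func, &ok, dst, src );
    /* ok is cleared (and error_message printed) if the callee clobbered
     * registers it should have preserved */
    return ok ? (int)ret : -1;
}
#endif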