typedef struct
{
/* context */
- uint8_t state[460];
+ DECLARE_ALIGNED( uint8_t, state[460], 16 );
+
+ int f8_bits_encoded; // only if using x264_cabac_size_decision()
/* state */
int i_low;
/* bit stream */
int i_queue;
int i_bytes_outstanding;
- int f8_bits_encoded; // only if using x264_cabac_size_decision()
uint8_t *p_start;
uint8_t *p;
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
- memcpy(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+ h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
}
return b_available;
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
+ pf->memcpy_aligned = memcpy;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
+ void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} x264_mc_functions_t;
emms
RET
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+; Copies n bytes from src (r1) to dst (r0) back-to-front, 32 bytes per
+; loop iteration, with a one-off 16-byte copy first when n is an odd
+; multiple of 16.
+; NOTE(review): assumes dst/src are at least 8-byte aligned and that n is
+; a multiple of 16 whose multiple-of-32 part is nonzero -- with n == 16
+; the .copy32 loop would drive r2 negative and touch memory below both
+; buffers. The cabac-state copies that use this pass several hundred
+; bytes, so the precondition holds there; confirm for any new caller.
+; NOTE(review): no emms here -- presumably MMX/FPU state is restored
+; elsewhere by the caller; verify against the project's emms policy.
+cglobal x264_memcpy_aligned_mmx, 3,3
+    test r2d, 16            ; n has a 16-byte remainder mod 32?
+    jz .copy32
+    sub r2d, 16             ; peel off the 16-byte tail first
+    movq mm0, [r1 + r2 + 0]
+    movq mm1, [r1 + r2 + 8]
+    movq [r0 + r2 + 0], mm0
+    movq [r0 + r2 + 8], mm1
+.copy32:                    ; main loop: 32 bytes per pass, descending offsets
+    sub r2d, 32
+    movq mm0, [r1 + r2 + 0]
+    movq mm1, [r1 + r2 + 8]
+    movq mm2, [r1 + r2 + 16]
+    movq mm3, [r1 + r2 + 24]
+    movq [r0 + r2 + 0], mm0
+    movq [r0 + r2 + 8], mm1
+    movq [r0 + r2 + 16], mm2
+    movq [r0 + r2 + 24], mm3
+    jg .copy32              ; continue while offset is still > 0
+    REP_RET
+
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+; Same contract as the mmx version, but copies 64 bytes per loop
+; iteration using 16-byte xmm transfers; movdqa requires dst and src to
+; be 16-byte aligned.
+; NOTE(review): assumes n is a multiple of 16 whose multiple-of-64 part
+; is nonzero -- smaller n (16, 32, 48) would drive r2 negative in the
+; .copy64 loop and access memory below both buffers. Holds for the
+; cabac-state copies this was added for; confirm for any new caller.
+cglobal x264_memcpy_aligned_sse2, 3,3
+    test r2d, 16            ; n has a 16-byte remainder mod 32?
+    jz .copy32
+    sub r2d, 16             ; peel off the 16-byte tail
+    movdqa xmm0, [r1 + r2]
+    movdqa [r0 + r2], xmm0
+.copy32:
+    test r2d, 32            ; remaining size has a 32-byte remainder mod 64?
+    jz .copy64
+    sub r2d, 32             ; peel off the 32-byte tail
+    movdqa xmm0, [r1 + r2 + 0]
+    movdqa xmm1, [r1 + r2 + 16]
+    movdqa [r0 + r2 + 0], xmm0
+    movdqa [r0 + r2 + 16], xmm1
+.copy64:                    ; main loop: 64 bytes per pass, descending offsets
+    sub r2d, 64
+    movdqa xmm0, [r1 + r2 + 0]
+    movdqa xmm1, [r1 + r2 + 16]
+    movdqa xmm2, [r1 + r2 + 32]
+    movdqa xmm3, [r1 + r2 + 48]
+    movdqa [r0 + r2 + 0], xmm0
+    movdqa [r0 + r2 + 16], xmm1
+    movdqa [r0 + r2 + 32], xmm2
+    movdqa [r0 + r2 + 48], xmm3
+    jg .copy64              ; continue while offset is still > 0
+    REP_RET
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
+extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
+ pf->memcpy_aligned = x264_memcpy_aligned_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
- /* todo: use sse2 */
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_sse2;
}
}
else if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_macroblock_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+
x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}