jg .height_loop
REP_RET
-cglobal x264_mc_copy_w16_sse2, 5,7
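+; %1 = function name, %2 = load instruction (movdqu for a possibly unaligned source, movdqa for an aligned one)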
+%macro COPY_W16_SSE2 2
+cglobal %1, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm1, [r2+r3]
- movdqu xmm2, [r2+r3*2]
- movdqu xmm3, [r2+r6]
+ %2 xmm0, [r2]
+ %2 xmm1, [r2+r3]
+ %2 xmm2, [r2+r3*2]
+ %2 xmm3, [r2+r6]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+r1*2], xmm2
movdqa [r0+r5], xmm3
sub r4d, 4
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
jg .height_loop
REP_RET
+%endmacro
+
+COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
}
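
Note that both instantiations store with movdqa, so the destination is assumed to be 16-byte aligned in either case; the macro parameter only changes the loads, meaning the _aligned variant additionally requires a 16-byte-aligned source pointer and source stride. As a rough illustration only (copy_w16_dispatch below is a hypothetical helper, not part of this patch or of x264's API), a caller that cannot guarantee source alignment statically could pick between the two variants at runtime:

#include <stdint.h>

extern void x264_mc_copy_w16_sse2        ( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );

/* Hypothetical dispatch helper, for illustration only. */
static void copy_w16_dispatch( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_height )
{
    /* movdqa loads fault on unaligned addresses, so only take the aligned
     * path when both the source pointer and the source stride keep every
     * row on a 16-byte boundary. */
    if( !((uintptr_t)src & 15) && !(i_src & 15) )
        x264_mc_copy_w16_aligned_sse2( dst, i_dst, src, i_src, i_height );
    else
        x264_mc_copy_w16_sse2( dst, i_dst, src, i_src, i_height );
}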