cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w16_sse2
+cglobal x264_pixel_avg_weight_4x4_mmxext
+cglobal x264_pixel_avg_weight_w8_mmxext
+cglobal x264_pixel_avg_weight_w16_mmxext
+
cglobal x264_mc_copy_w4_mmxext
cglobal x264_mc_copy_w8_mmxext
cglobal x264_mc_copy_w16_mmxext
+;=============================================================================
+; weighted prediction
+;=============================================================================
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
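+; per pixel this computes:
+;   dst[x] = ( dst[x]*weight_dst + src[x]*(64 - weight_dst) + 32 ) >> 6, clamped to [0,255]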
+
+%macro BIWEIGHT_4P_MMX 2
+ movd mm0, %1
+ movd mm1, %2
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ pmullw mm0, mm4
+ pmullw mm1, mm5
+ paddw mm0, mm1
+ paddw mm0, mm6
+ psraw mm0, 6
+ pmaxsw mm0, mm7
+ packuswb mm0, mm0
+ movd %1, mm0
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+; mov rdi, rdi ; dst
+ movsxd rsi, esi ; i_dst
+; mov rdx, rdx ; src
+ movsxd rcx, ecx ; i_src
+; movsxd r8, r8d ; i_weight_dst
+; movsxd r9, r9d ; i_height
+
+ movd mm4, r8d
+ pshufw mm4, mm4, 0 ; weight_dst
+ movq mm5, [pw_64]
+ psubw mm5, mm4 ; weight_src
+ movq mm6, [pw_32] ; rounding
+ pxor mm7, mm7
+
+ ALIGN 4
+ .height_loop ; loop body and branch are supplied by the caller
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst, int i_height )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w16_mmxext:
+ BIWEIGHT_START_MMX
+
+ BIWEIGHT_4P_MMX [rdi ], [rdx ]
+ BIWEIGHT_4P_MMX [rdi+ 4], [rdx+ 4]
+ BIWEIGHT_4P_MMX [rdi+ 8], [rdx+ 8]
+ BIWEIGHT_4P_MMX [rdi+12], [rdx+12]
+
+ add rdi, rsi
+ add rdx, rcx
+ dec r9d
+ jnz .height_loop
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst, int i_height )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w8_mmxext:
+ BIWEIGHT_START_MMX
+
+ BIWEIGHT_4P_MMX [rdi ], [rdx ]
+ BIWEIGHT_4P_MMX [rdi+4 ], [rdx+4 ]
+ BIWEIGHT_4P_MMX [rdi+rsi ], [rdx+rcx ]
+ BIWEIGHT_4P_MMX [rdi+rsi+4], [rdx+rcx+4]
+
+ lea rdi, [rdi+rsi*2]
+ lea rdx, [rdx+rcx*2]
+ sub r9d, byte 2
+ jnz .height_loop
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_4x4_mmxext:
+ BIWEIGHT_START_MMX
+ BIWEIGHT_4P_MMX [rdi ], [rdx ]
+ BIWEIGHT_4P_MMX [rdi+rsi ], [rdx+rcx ]
+ BIWEIGHT_4P_MMX [rdi+rsi*2], [rdx+rcx*2]
+ add rdi, rsi ; advance one row so rdi+rsi*2 addresses row 3
+ add rdx, rcx
+ BIWEIGHT_4P_MMX [rdi+rsi*2], [rdx+rcx*2]
+ ret
+
+
+
;=============================================================================
; pixel copy
;=============================================================================
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w16_sse2
+cglobal x264_pixel_avg_weight_4x4_mmxext
+cglobal x264_pixel_avg_weight_w8_mmxext
+cglobal x264_pixel_avg_weight_w16_mmxext
+
cglobal x264_mc_copy_w4_mmxext
cglobal x264_mc_copy_w8_mmxext
cglobal x264_mc_copy_w16_mmxext
ret
+;=============================================================================
+; weighted prediction
+;=============================================================================
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
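+; per pixel this computes:
+;   dst[x] = ( dst[x]*weight_dst + src[x]*(64 - weight_dst) + 32 ) >> 6, clamped to [0,255]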
+
+%macro BIWEIGHT_4P_MMX 2
+ movd mm0, %1
+ movd mm1, %2
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ pmullw mm0, mm4
+ pmullw mm1, mm5
+ paddw mm0, mm1
+ paddw mm0, mm6
+ psraw mm0, 6
+ pmaxsw mm0, mm7
+ packuswb mm0, mm0
+ movd %1, mm0
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ push edi
+ push esi
+ mov edi, [esp+12] ; dst
+ mov esi, [esp+16] ; i_dst
+ mov edx, [esp+20] ; src
+ mov ecx, [esp+24] ; i_src
+
+ pshufw mm4, [esp+28], 0 ; weight_dst
+ movq mm5, [pw_64]
+ psubw mm5, mm4 ; weight_src
+ movq mm6, [pw_32] ; rounding
+ pxor mm7, mm7
+%endmacro
+%macro BIWEIGHT_END_MMX 0
+ pop esi
+ pop edi
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst, int i_height )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w16_mmxext:
+ BIWEIGHT_START_MMX
+ mov eax, [esp+32] ; i_height
+ ALIGN 4
+ .height_loop
+
+ BIWEIGHT_4P_MMX [edi ], [edx ]
+ BIWEIGHT_4P_MMX [edi+ 4], [edx+ 4]
+ BIWEIGHT_4P_MMX [edi+ 8], [edx+ 8]
+ BIWEIGHT_4P_MMX [edi+12], [edx+12]
+
+ add edi, esi
+ add edx, ecx
+ dec eax
+ jnz .height_loop
+ BIWEIGHT_END_MMX
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst, int i_height )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w8_mmxext:
+ BIWEIGHT_START_MMX
+ mov eax, [esp+32] ; i_height
+ ALIGN 4
+ .height_loop
+
+ BIWEIGHT_4P_MMX [edi ], [edx ]
+ BIWEIGHT_4P_MMX [edi+4 ], [edx+4 ]
+ BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
+ BIWEIGHT_4P_MMX [edi+esi+4], [edx+ecx+4]
+
+ lea edi, [edi+esi*2]
+ lea edx, [edx+ecx*2]
+ sub eax, byte 2
+ jnz .height_loop
+ BIWEIGHT_END_MMX
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_4x4_mmxext:
+ BIWEIGHT_START_MMX
+ BIWEIGHT_4P_MMX [edi ], [edx ]
+ BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
+ BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]
+ add edi, esi ; advance one row so edi+esi*2 addresses row 3
+ add edx, ecx
+ BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2]
+ BIWEIGHT_END_MMX
+
+
+
;=============================================================================
; pixel copy
;=============================================================================
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
+extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+#define AVG(W,H) \
+static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
+{ \
+ x264_pixel_avg_w ## W ## _mmxext( dst, i_dst, dst, i_dst, src, i_src, H ); \
+}
+AVG(16,16)
+AVG(16,8)
+AVG(8,16)
+AVG(8,8)
+AVG(8,4)
+AVG(4,8)
+AVG(4,4)
+AVG(4,2)
+
+#define AVG_WEIGHT(W,H) \
+void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
+{ \
+ x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
+}
+AVG_WEIGHT(16,16)
+AVG_WEIGHT(16,8)
+AVG_WEIGHT(8,16)
+AVG_WEIGHT(8,8)
+AVG_WEIGHT(8,4)
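+
+/* Compiled-out plain-C sketch of the weighted average that the MMX routines
+ * above implement, under the same implicit-bipred assumptions (log2_denom = 5,
+ * offset = 0, weight_dst + weight_src = 64). The helper name is illustrative
+ * only; nothing registers or calls it. */
+#if 0
+static void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+                                  int i_weight_dst, int width, int height )
+{
+    int x, y;
+    for( y = 0; y < height; y++, dst += i_dst, src += i_src )
+        for( x = 0; x < width; x++ )
+        {
+            int v = ( dst[x] * i_weight_dst + src[x] * (64 - i_weight_dst) + 32 ) >> 6;
+            dst[x] = v < 0 ? 0 : ( v > 255 ? 255 : v );
+        }
+}
+#endif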
+
#if 0
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
{
pf->mc_luma = mc_luma_mmx;
pf->get_ref = get_ref_mmx;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmxext;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmxext;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
+
+ pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
+ pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
+ pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
+ pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
+ pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
+ pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
+ // avg_weight_4x8 is rare and 4x2 is not used
}
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{