From: Loren Merritt
Date: Sat, 1 Oct 2005 06:48:13 +0000 (+0000)
Subject: mmx avg (already existed but not used for bipred)
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=458e63cadb0c6295273fd85def3aca0098a309e3;p=libx264

mmx avg (already existed but not used for bipred)
mmx biweighted avg (3x faster than C)

git-svn-id: svn://svn.videolan.org/x264/trunk@307 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm
index ae4906c2..0d4f705d 100644
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -74,6 +74,10 @@ cglobal x264_pixel_avg_w8_mmxext
 cglobal x264_pixel_avg_w16_mmxext
 cglobal x264_pixel_avg_w16_sse2
 
+cglobal x264_pixel_avg_weight_4x4_mmxext
+cglobal x264_pixel_avg_weight_w8_mmxext
+cglobal x264_pixel_avg_weight_w16_mmxext
+
 cglobal x264_mc_copy_w4_mmxext
 cglobal x264_mc_copy_w8_mmxext
 cglobal x264_mc_copy_w16_mmxext
@@ -247,6 +251,98 @@ ALIGN 4
 
+;=============================================================================
+; weighted prediction
+;=============================================================================
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
+
+%macro BIWEIGHT_4P_MMX 2
+    movd      mm0, %1
+    movd      mm1, %2
+    punpcklbw mm0, mm7
+    punpcklbw mm1, mm7
+    pmullw    mm0, mm4
+    pmullw    mm1, mm5
+    paddw     mm0, mm1
+    paddw     mm0, mm6
+    psraw     mm0, 6
+    pmaxsw    mm0, mm7
+    packuswb  mm0, mm0
+    movd      %1,  mm0
+%endmacro

+%macro BIWEIGHT_START_MMX 0
+;   mov    rdi, rdi  ; dst
+    movsxd rsi, esi  ; i_dst
+;   mov    rdx, rdx  ; src
+    movsxd rcx, ecx  ; i_src
+;   movsxd r8,  r8d  ; i_weight_dst
+;   movsxd r9,  r9d  ; i_height
+
+    movd   mm4, r8d
+    pshufw mm4, mm4, 0    ; weight_dst
+    movq   mm5, [pw_64]
+    psubw  mm5, mm4       ; weight_src
+    movq   mm6, [pw_32]   ; rounding
+    pxor   mm7, mm7
+
+    ALIGN 4
+    .height_loop
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w16_mmxext:
+    BIWEIGHT_START_MMX
+
+    BIWEIGHT_4P_MMX  [rdi   ], [rdx   ]
+    BIWEIGHT_4P_MMX  [rdi+ 4], [rdx+ 4]
+    BIWEIGHT_4P_MMX  [rdi+ 8], [rdx+ 8]
+    BIWEIGHT_4P_MMX  [rdi+12], [rdx+12]
+
+    add  rdi, rsi
+    add  rdx, rcx
+    dec  r9d
+    jnz  .height_loop
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w8_mmxext:
+    BIWEIGHT_START_MMX
+
+    BIWEIGHT_4P_MMX  [rdi      ], [rdx      ]
+    BIWEIGHT_4P_MMX  [rdi+4    ], [rdx+4    ]
+    BIWEIGHT_4P_MMX  [rdi+rsi  ], [rdx+rcx  ]
+    BIWEIGHT_4P_MMX  [rdi+rsi+4], [rdx+rcx+4]
+
+    lea  rdi, [rdi+rsi*2]
+    lea  rdx, [rdx+rcx*2]
+    sub  r9d, byte 2
+    jnz  .height_loop
+    ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_4x4_mmxext:
+    BIWEIGHT_START_MMX
+    BIWEIGHT_4P_MMX  [rdi      ], [rdx      ]
+    BIWEIGHT_4P_MMX  [rdi+rsi  ], [rdx+rcx  ]
+    BIWEIGHT_4P_MMX  [rdi+rsi*2], [rdx+rcx*2]
+    add  rdi, rsi
+    add  rdx, rcx
+    BIWEIGHT_4P_MMX  [rdi+rsi*2], [rdx+rcx*2]
+    ret
+
+
 ;=============================================================================
 ; pixel copy
 ;=============================================================================
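For reference, each BIWEIGHT_4P_MMX invocation above blends four pixels. The
scalar C below is a minimal sketch of the same per-pixel math (function name
hypothetical), under the implicit-bipred constraints stated in the asm:
log2_denom = 5, offset = 0, and the two weights summing to 64.

#include <stdint.h>

/* Hypothetical scalar reference for BIWEIGHT_4P_MMX: weight both inputs,
 * add the rounding constant 32 (pw_32), arithmetic-shift right by 6
 * (psraw), clamp negatives to 0 (pmaxsw against mm7 == 0), and saturate
 * to 255 (packuswb). Assumes >> on a negative int is arithmetic, as it
 * is on the compilers x264 targets. */
static uint8_t biweight_pixel_ref( uint8_t dst, uint8_t src, int i_weight_dst )
{
    int i_weight_src = 64 - i_weight_dst;  /* mm5 = pw_64 - mm4 */
    int v = ( dst * i_weight_dst + src * i_weight_src + 32 ) >> 6;
    if( v < 0 )   v = 0;
    if( v > 255 ) v = 255;
    return (uint8_t)v;
}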
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm
index db50294d..55c383fe 100644
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -72,6 +72,10 @@ cglobal x264_pixel_avg_w8_mmxext
 cglobal x264_pixel_avg_w16_mmxext
 cglobal x264_pixel_avg_w16_sse2
 
+cglobal x264_pixel_avg_weight_4x4_mmxext
+cglobal x264_pixel_avg_weight_w8_mmxext
+cglobal x264_pixel_avg_weight_w16_mmxext
+
 cglobal x264_mc_copy_w4_mmxext
 cglobal x264_mc_copy_w8_mmxext
 cglobal x264_mc_copy_w16_mmxext
@@ -244,6 +248,105 @@ ALIGN 4
     ret
 
+;=============================================================================
+; weighted prediction
+;=============================================================================
+; implicit bipred only:
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
+
+%macro BIWEIGHT_4P_MMX 2
+    movd      mm0, %1
+    movd      mm1, %2
+    punpcklbw mm0, mm7
+    punpcklbw mm1, mm7
+    pmullw    mm0, mm4
+    pmullw    mm1, mm5
+    paddw     mm0, mm1
+    paddw     mm0, mm6
+    psraw     mm0, 6
+    pmaxsw    mm0, mm7
+    packuswb  mm0, mm0
+    movd      %1,  mm0
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+    push   edi
+    push   esi
+    mov    edi, [esp+12]     ; dst
+    mov    esi, [esp+16]     ; i_dst
+    mov    edx, [esp+20]     ; src
+    mov    ecx, [esp+24]     ; i_src
+
+    pshufw mm4, [esp+28], 0  ; weight_dst
+    movq   mm5, [pw_64]
+    psubw  mm5, mm4          ; weight_src
+    movq   mm6, [pw_32]      ; rounding
+    pxor   mm7, mm7
+%endmacro
+%macro BIWEIGHT_END_MMX 0
+    pop  esi
+    pop  edi
+    ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w16_mmxext:
+    BIWEIGHT_START_MMX
+    mov  eax, [esp+32]  ; i_height
+    ALIGN 4
+    .height_loop
+
+    BIWEIGHT_4P_MMX  [edi   ], [edx   ]
+    BIWEIGHT_4P_MMX  [edi+ 4], [edx+ 4]
+    BIWEIGHT_4P_MMX  [edi+ 8], [edx+ 8]
+    BIWEIGHT_4P_MMX  [edi+12], [edx+12]
+
+    add  edi, esi
+    add  edx, ecx
+    dec  eax
+    jnz  .height_loop
+    BIWEIGHT_END_MMX
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_w8_mmxext:
+    BIWEIGHT_START_MMX
+    mov  eax, [esp+32]
+    ALIGN 4
+    .height_loop
+
+    BIWEIGHT_4P_MMX  [edi      ], [edx      ]
+    BIWEIGHT_4P_MMX  [edi+4    ], [edx+4    ]
+    BIWEIGHT_4P_MMX  [edi+esi  ], [edx+ecx  ]
+    BIWEIGHT_4P_MMX  [edi+esi+4], [edx+ecx+4]
+
+    lea  edi, [edi+esi*2]
+    lea  edx, [edx+ecx*2]
+    sub  eax, byte 2
+    jnz  .height_loop
+    BIWEIGHT_END_MMX
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
+;-----------------------------------------------------------------------------
+x264_pixel_avg_weight_4x4_mmxext:
+    BIWEIGHT_START_MMX
+    BIWEIGHT_4P_MMX  [edi      ], [edx      ]
+    BIWEIGHT_4P_MMX  [edi+esi  ], [edx+ecx  ]
+    BIWEIGHT_4P_MMX  [edi+esi*2], [edx+ecx*2]
+    add  edi, esi
+    add  edx, ecx
+    BIWEIGHT_4P_MMX  [edi+esi*2], [edx+ecx*2]
+    BIWEIGHT_END_MMX
+
+
 ;=============================================================================
 ; pixel copy
 ;=============================================================================
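Unlike the amd64 version, which receives its arguments in registers, the i386
version reads them from the stack: the return address plus the saved edi and
esi occupy esp+0 through esp+11, so under cdecl the first argument lands at
esp+12. A reference declaration mapping each argument to the slot named in
BIWEIGHT_START_MMX (offsets taken from the asm comments above):

#include <stdint.h>

extern void x264_pixel_avg_weight_w16_mmxext(
    uint8_t *dst,          /* [esp+12] */
    int      i_dst,        /* [esp+16] */
    uint8_t *src,          /* [esp+20] */
    int      i_src,        /* [esp+24] */
    int      i_weight_dst, /* [esp+28] */
    int      i_height );   /* [esp+32]; the 4x4 variant omits this argument */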
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c
index 9000f103..7af01c46 100644
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -42,11 +42,39 @@ extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *
 extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
+extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
 extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
 
+#define AVG(W,H) \
+static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
+{ \
+    x264_pixel_avg_w ## W ## _mmxext( dst, i_dst, dst, i_dst, src, i_src, H ); \
+}
+AVG(16,16)
+AVG(16,8)
+AVG(8,16)
+AVG(8,8)
+AVG(8,4)
+AVG(4,8)
+AVG(4,4)
+AVG(4,2)
+
+#define AVG_WEIGHT(W,H) \
+void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
+{ \
+    x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
+}
+AVG_WEIGHT(16,16)
+AVG_WEIGHT(16,8)
+AVG_WEIGHT(8,16)
+AVG_WEIGHT(8,8)
+AVG_WEIGHT(8,4)
+
 #if 0
 
 #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
@@ -1128,6 +1156,23 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
 {
     pf->mc_luma   = mc_luma_mmx;
     pf->get_ref   = get_ref_mmx;
+
+    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
+    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
+    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;
+    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmxext;
+    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmxext;
+    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmxext;
+    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmxext;
+    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmxext;
+
+    pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
+    pf->avg_weight[PIXEL_16x8]  = x264_pixel_avg_weight_16x8_mmxext;
+    pf->avg_weight[PIXEL_8x16]  = x264_pixel_avg_weight_8x16_mmxext;
+    pf->avg_weight[PIXEL_8x8]   = x264_pixel_avg_weight_8x8_mmxext;
+    pf->avg_weight[PIXEL_8x4]   = x264_pixel_avg_weight_8x4_mmxext;
+    pf->avg_weight[PIXEL_4x4]   = x264_pixel_avg_weight_4x4_mmxext;
+    // avg_weight_4x8 is rare and 4x2 is not used
 }
 void x264_mc_sse2_init( x264_mc_functions_t *pf )
 {
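The AVG_WEIGHT wrappers slot into the mc function table with one entry per
partition size. A hypothetical call site (helper name invented for
illustration; include path assumed from the tree layout) blends the list-1
prediction in src into the list-0 prediction already sitting in dst:

#include "common/common.h"  /* x264_mc_functions_t, PIXEL_16x16 */

/* Hypothetical helper: implicit bipred blend of one 16x16 partition.
 * i_weight_dst weights the block in dst; src gets 64 - i_weight_dst,
 * matching the pw_64 - weight_dst computed in BIWEIGHT_START_MMX. */
static void blend_bipred_16x16( x264_mc_functions_t *pf,
                                uint8_t *dst, int i_dst,
                                uint8_t *src, int i_src, int i_weight_dst )
{
    pf->avg_weight[PIXEL_16x16]( dst, i_dst, src, i_src, i_weight_dst );
}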