From: Fiona Glaser Date: Sat, 7 Jun 2008 04:57:33 +0000 (-0600) Subject: avg_weight_sse2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=49ce3ac63b5305ca28f65bd75e6a4e6540d5954a;p=libx264 avg_weight_sse2 --- diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 957d03c0..2513014c 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -200,7 +200,7 @@ SECTION .text %macro SPLATW 1 %ifidn m0, xmm0 pshuflw %1, %1, 0 - punpcklqdq %1, %1 + movlhps %1, %1 %else pshufw %1, %1, 0 %endif diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 3dabe9f6..21c7b0d8 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -28,8 +28,8 @@ SECTION_RODATA pw_4: times 4 dw 4 pw_8: times 4 dw 8 -pw_32: times 4 dw 32 -pw_64: times 4 dw 64 +pw_32: times 8 dw 32 +pw_64: times 8 dw 64 sw_64: dd 64 SECTION .text @@ -483,33 +483,42 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa ; implicit bipred only: ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 -%macro BIWEIGHT_4P_MMX 2 - movd mm0, %1 - movd mm1, %2 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - pmullw mm0, mm4 - pmullw mm1, mm5 - paddw mm0, mm1 - paddw mm0, mm6 - psraw mm0, 6 - pmaxsw mm0, mm7 - packuswb mm0, mm0 - movd %1, mm0 +%macro SPLATW 2 +%if regsize==16 + pshuflw %1, %2, 0 + movlhps %1, %1 +%else + pshufw %1, %2, 0 +%endif +%endmacro + +%macro BIWEIGHT 2 + movh m0, %1 + movh m1, %2 + punpcklbw m0, m7 + punpcklbw m1, m7 + pmullw m0, m4 + pmullw m1, m5 + paddw m0, m1 + paddw m0, m6 + psraw m0, 6 + pmaxsw m0, m7 + packuswb m0, m0 + movh %1, m0 %endmacro -%macro BIWEIGHT_START_MMX 1 +%macro BIWEIGHT_START 1 %ifidn r4m, r4d - movd mm4, r4m - pshufw mm4, mm4, 0 ; weight_dst + movd m4, r4m + SPLATW m4, m4 ; weight_dst %else - pshufw mm4, r4m, 0 + SPLATW m4, r4m %endif picgetgot r4 - movq mm5, [pw_64 GLOBAL] - psubw mm5, mm4 ; weight_src - movq mm6, [pw_32 GLOBAL] ; rounding - pxor mm7, mm7 + mova m5, [pw_64 GLOBAL] + psubw m5, m4 ; weight_src + mova m6, [pw_32 GLOBAL] ; rounding + pxor m7, m7 %if %1 %ifidn r5m, r5d %define t0 r5d @@ -524,43 +533,37 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa ;----------------------------------------------------------------------------- ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w16_mmxext, 4,5 - BIWEIGHT_START_MMX 1 - BIWEIGHT_4P_MMX [r0 ], [r2 ] - BIWEIGHT_4P_MMX [r0+ 4], [r2+ 4] - BIWEIGHT_4P_MMX [r0+ 8], [r2+ 8] - BIWEIGHT_4P_MMX [r0+12], [r2+12] +cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1 + BIWEIGHT_START 0 + BIWEIGHT [r0 ], [r2 ] + BIWEIGHT [r0+r1 ], [r2+r3 ] + BIWEIGHT [r0+r1*2], [r2+r3*2] add r0, r1 add r2, r3 - dec t0 - jg .height_loop - REP_RET + BIWEIGHT [r0+r1*2], [r2+r3*2] + RET -;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_w8_mmxext, 4,5 - BIWEIGHT_START_MMX 1 - BIWEIGHT_4P_MMX [r0 ], [r2 ] - BIWEIGHT_4P_MMX [r0+4], [r2+4] +%macro AVG_WEIGHT 2 +cglobal x264_pixel_avg_weight_w%2_%1, 4,5 + BIWEIGHT_START 1 +%assign x 0 +%rep %2*2/regsize + BIWEIGHT [r0+x], [r2+x] +%assign x x+regsize/2 +%endrep add r0, r1 add r2, r3 dec t0 jg .height_loop REP_RET +%endmacro -;----------------------------------------------------------------------------- -; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ) -;----------------------------------------------------------------------------- -cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1 - BIWEIGHT_START_MMX 0 - BIWEIGHT_4P_MMX [r0 ], [r2 ] - BIWEIGHT_4P_MMX [r0+r1 ], [r2+r3 ] - BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2] - add r0, r1 - add r2, r3 - BIWEIGHT_4P_MMX [r0+r1*2], [r2+r3*2] - RET +INIT_MMX +AVG_WEIGHT mmxext, 8 +AVG_WEIGHT mmxext, 16 +INIT_XMM +AVG_WEIGHT sse2, 8 +AVG_WEIGHT sse2, 16 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 4a6194ae..45516065 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -46,6 +46,8 @@ extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int ); extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int ); extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int ); +extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, int, int ); +extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, int, int ); extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int ); extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int ); extern void x264_prefetch_ref_mmxext( uint8_t *, int, int ); @@ -70,16 +72,22 @@ PIXEL_AVG_WALL(cache64_mmxext) PIXEL_AVG_WALL(cache64_sse2) PIXEL_AVG_WALL(sse2) -#define AVG_WEIGHT(W,H) \ -void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \ +#define AVG_WEIGHT(W,H,name) \ +void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \ { \ - x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \ + x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src, i_src, i_weight_dst, H ); \ } -AVG_WEIGHT(16,16) -AVG_WEIGHT(16,8) -AVG_WEIGHT(8,16) -AVG_WEIGHT(8,8) -AVG_WEIGHT(8,4) + +AVG_WEIGHT(16,16,mmxext) +AVG_WEIGHT(16,8,mmxext) +AVG_WEIGHT(8,16,mmxext) +AVG_WEIGHT(8,8,mmxext) +AVG_WEIGHT(8,4,mmxext) +AVG_WEIGHT(16,16,sse2) +AVG_WEIGHT(16,8,sse2) +AVG_WEIGHT(8,16,sse2) +AVG_WEIGHT(8,8,sse2) +AVG_WEIGHT(8,4,sse2) #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\ static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\ @@ -244,7 +252,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext; - + pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext; pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext; pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext; @@ -285,6 +293,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2; + pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2; + pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2; + pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2; + pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2; + pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2; pf->hpel_filter = x264_hpel_filter_sse2; if( cpu&X264_CPU_SSE2_IS_FAST )