From: Fiona Glaser
Date: Wed, 19 Mar 2008 19:40:41 +0000 (-0600)
Subject: get_ref_sse2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8773988471e5469ebd00841cccb4eee8bbdb54dd;p=libx264

get_ref_sse2
---

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index e1d074a5..3d1db631 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -41,11 +41,11 @@ SECTION .text
 ; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
 ;                                 uint8_t *src, int src_stride );
 ;-----------------------------------------------------------------------------
-%macro AVGH 2
+%macro AVGH 3
 %assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
-cglobal x264_pixel_avg_%1x%2_mmxext
+cglobal x264_pixel_avg_%1x%2_%3
     mov eax, %2
-    jmp x264_pixel_avg_w%1_mmxext
+    jmp x264_pixel_avg_w%1_%3
 %assign function_align 16
 %endmacro

@@ -95,9 +95,9 @@ AVG_START x264_pixel_avg_w4_mmxext
     movd   [t0+t1], mm1
     AVG_END

-AVGH 4, 8
-AVGH 4, 4
-AVGH 4, 2
+AVGH 4, 8, mmxext
+AVGH 4, 4, mmxext
+AVGH 4, 2, mmxext

 AVG_START x264_pixel_avg_w8_mmxext
     movq   mm0, [t2]
@@ -108,9 +108,9 @@ AVG_START x264_pixel_avg_w8_mmxext
     movq   [t0+t1], mm1
     AVG_END

-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
+AVGH 8, 16, mmxext
+AVGH 8, 8, mmxext
+AVGH 8, 4, mmxext

 AVG_START x264_pixel_avg_w16_mmxext
     movq   mm0, [t2 ]
@@ -127,8 +127,8 @@ AVG_START x264_pixel_avg_w16_mmxext
     movq   [t0+t1+8], mm3
     AVG_END

-AVGH 16, 16
-AVGH 16, 8
+AVGH 16, 16, mmxext
+AVGH 16, 8, mmxext

 AVG_START x264_pixel_avg_w16_sse2
     movdqu xmm0, [t2]
@@ -139,6 +139,9 @@ AVG_START x264_pixel_avg_w16_sse2
     movdqa [t0+t1], xmm1
     AVG_END

+AVGH 16, 16, sse2
+AVGH 16, 8, sse2
+


 ;=============================================================================
@@ -226,6 +229,48 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
     jg     .height_loop
     REP_RET

+cglobal x264_pixel_avg2_w16_sse2, 6,7
+    sub    r4, r2
+    lea    r6, [r4+r3]
+.height_loop:
+    movdqu xmm0, [r2]
+    movdqu xmm2, [r2+r3]
+    movdqu xmm1, [r2+r4]
+    movdqu xmm3, [r2+r6]
+    pavgb  xmm0, xmm1
+    pavgb  xmm2, xmm3
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm2
+    lea    r2, [r2+r3*2]
+    lea    r0, [r0+r1*2]
+    sub    r5d, 2
+    jg     .height_loop
+    REP_RET
+
+cglobal x264_pixel_avg2_w20_sse2, 6,7
+    sub    r4, r2
+    lea    r6, [r4+r3]
+.height_loop:
+    movdqu xmm0, [r2]
+    movdqu xmm2, [r2+r3]
+    movdqu xmm1, [r2+r4]
+    movdqu xmm3, [r2+r6]
+    movd   mm4, [r2+16]
+    movd   mm5, [r2+r3+16]
+    pavgb  xmm0, xmm1
+    pavgb  xmm2, xmm3
+    pavgb  mm4, [r2+r4+16]
+    pavgb  mm5, [r2+r6+16]
+    movdqa [r0], xmm0
+    movd   [r0+16], mm4
+    movdqa [r0+r1], xmm2
+    movd   [r0+r1+16], mm5
+    lea    r2, [r2+r3*2]
+    lea    r0, [r0+r1*2]
+    sub    r5d, 2
+    jg     .height_loop
+    REP_RET
+


 ;=============================================================================
@@ -296,6 +341,24 @@ cglobal x264_mc_copy_w16_mmx, 5,7
     jg     .height_loop
     REP_RET

+cglobal x264_mc_copy_w16_sse2,5,7
+    lea    r6, [r3*3]
+    lea    r5, [r1*3]
+.height_loop
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+r3*2]
+    movdqu xmm3, [r2+r6]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+r1*2], xmm2
+    movdqa [r0+r5], xmm3
+    lea    r2, [r2+r3*4]
+    lea    r0, [r0+r1*4]
+    sub    r4d, 4
+    jg     .height_loop
+    REP_RET
+


 ;=============================================================================
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index bf3090e2..229f5c98 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -374,18 +374,18 @@ cglobal x264_memcpy_aligned_sse2, 3,3
     jz .copy64
     sub r2d, 32
     movdqa xmm0, [r1 + r2 + 0]
-    movdqa xmm1, [r1 + r2 + 16]
     movdqa [r0 + r2 + 0], xmm0
+    movdqa xmm1, [r1 + r2 + 16]
     movdqa [r0 + r2 + 16], xmm1
 .copy64:
     sub r2d, 64
     movdqa xmm0, [r1 + r2 + 0]
-    movdqa xmm1, [r1 + r2 + 16]
-    movdqa xmm2, [r1 + r2 + 32]
-    movdqa xmm3, [r1 + r2 + 48]
     movdqa [r0 + r2 + 0], xmm0
+    movdqa xmm1, [r1 + r2 + 16]
     movdqa [r0 + r2 + 16], xmm1
+    movdqa xmm2, [r1 + r2 + 32]
     movdqa [r0 + r2 + 32], xmm2
+    movdqa xmm3, [r1 + r2 + 48]
     movdqa [r0 + r2 + 48], xmm3
     jg .copy64
     REP_RET
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index ec343ded..5d855dea 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -28,6 +28,8 @@
 #include "common/common.h"

 /* NASM functions */
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
 extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
 extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
 extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
@@ -41,6 +43,8 @@ extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *
 extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
 extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
 extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
@@ -87,56 +91,78 @@ static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int,
     NULL,
     x264_mc_copy_w16_mmx
 };
+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+    NULL,
+    x264_pixel_avg2_w4_mmxext,
+    x264_pixel_avg2_w8_mmxext,
+    x264_pixel_avg2_w12_mmxext,
+    x264_pixel_avg2_w16_sse2,
+    x264_pixel_avg2_w20_sse2,
+};
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_mmx,
+    x264_mc_copy_w8_mmx,
+    NULL,
+    x264_mc_copy_w16_sse2,
+};

 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
-                     uint8_t *src[4], int i_src_stride,
-                     int mvx, int mvy,
-                     int i_width, int i_height )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_mmxext[i_width>>2](
-            dst, i_dst_stride, src1, i_src_stride,
-            src2, i_height );
-    }
-    else
-    {
-        x264_mc_copy_wtab_mmx[i_width>>2](
-            dst, i_dst_stride, src1, i_src_stride, i_height );
-    }
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+                     uint8_t *src[4], int i_src_stride,\
+                     int mvx, int mvy,\
+                     int i_width, int i_height )\
+{\
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+    if( qpel_idx & 5 ) /* qpel interpolation needed */\
+    {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+        x264_pixel_avg_wtab_##instr1[i_width>>2](\
+            dst, i_dst_stride, src1, i_src_stride,\
+            src2, i_height );\
+    }\
+    else\
+    {\
+        x264_mc_copy_wtab_##instr2[i_width>>2](\
+            dst, i_dst_stride, src1, i_src_stride, i_height );\
+    }\
 }

-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
-                         uint8_t *src[4], int i_src_stride,
-                         int mvx, int mvy,
-                         int i_width, int i_height )
-{
-    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
-    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
-    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
-    if( qpel_idx & 5 ) /* qpel interpolation needed */
-    {
-        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
-        x264_pixel_avg_wtab_mmxext[i_width>>2](
-            dst, *i_dst_stride, src1, i_src_stride,
-            src2, i_height );
-        return dst;
-    }
-    else
-    {
-        *i_dst_stride = i_src_stride;
-        return src1;
-    }
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+                         uint8_t *src[4], int i_src_stride,\
+                         int mvx, int mvy,\
+                         int i_width, int i_height )\
+{\
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+    if( qpel_idx & 5 ) /* qpel interpolation needed */\
+    {\
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+        x264_pixel_avg_wtab_##name[i_width>>2](\
+            dst, *i_dst_stride, src1, i_src_stride,\
+            src2, i_height );\
+        return dst;\
+    }\
+    else\
+    {\
+        *i_dst_stride = i_src_stride;\
+        return src1;\
+    }\
 }
+GET_REF(mmxext)
+GET_REF(sse2)

 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
@@ -180,6 +206,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )

     if( !(cpu&X264_CPU_SSE2) )
         return;
-
+
     pf->memcpy_aligned = x264_memcpy_aligned_sse2;
+
+    // disable on AMD processors since it is slower
+    if( cpu&X264_CPU_3DNOW )
+        return;
+
+    pf->mc_luma = mc_luma_sse2;
+    pf->get_ref = get_ref_sse2;
+    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
 }