From dfb854775c7b52945a84ef756dc88a4ccb7c2d2c Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Mon, 2 Apr 2007 23:56:09 +0000
Subject: [PATCH] in hpel search, merge two 16x16 mc calls into one 16x17.

15% faster hpel, .3% overall.

git-svn-id: svn://svn.videolan.org/x264/trunk@638 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/amd64inc.asm | 10 ++++++
 common/amd64/mc-a.asm     | 51 +++++++++++++++++++++++------
 common/i386/mc-a.asm      | 68 +++++++++++++++++++++++++++++++++------
 common/i386/mc-c.c        |  8 +++--
 common/mc.h               |  1 +
 common/ppc/mc.c           |  9 ++++++
 encoder/me.c              | 20 ++++--------
 7 files changed, 130 insertions(+), 37 deletions(-)

diff --git a/common/amd64/amd64inc.asm b/common/amd64/amd64inc.asm
index 44422789..e9409965 100644
--- a/common/amd64/amd64inc.asm
+++ b/common/amd64/amd64inc.asm
@@ -78,6 +78,11 @@ BITS 64
 %define parm7d dword parm7q
 %define parm8d dword parm8q
 
+%define temp1q rdi
+%define temp2q rsi
+%define temp1d edi
+%define temp2d esi
+
 %macro firstpush 1
     db 0x48
     push %1
@@ -234,6 +239,11 @@ SECTION .text
 %define parm7d dword parm7q
 %define parm8d dword parm8q
 
+%define temp1q r9
+%define temp2q r8
+%define temp1d r9d
+%define temp2d r8d
+
 %macro allocstack 1
 %endmacro
 
diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm
index 8ae1416b..7e0bfa27 100644
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -59,6 +59,7 @@ SECTION .text
 cglobal x264_pixel_avg_w4_mmxext
 cglobal x264_pixel_avg_w8_mmxext
 cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
 cglobal x264_pixel_avg_w16_sse2
 
 cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -103,7 +104,7 @@ ALIGN 4
     lea parm3q, [parm3q+parm4q*2]
     lea r10, [r10+r11*2]
     lea parm1q, [parm1q+parm2q*2]
-    jne .height_loop
+    jg .height_loop
     rep ret
 
 
@@ -132,7 +133,7 @@ ALIGN 4
     lea parm3q, [parm3q+parm4q*2]
     lea r10, [r10+r11*2]
     lea parm1q, [parm1q+parm2q*2]
-    jne .height_loop
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -159,7 +160,37 @@ ALIGN 4
     lea parm3q, [parm3q+parm4q]
     lea r10, [r10+r11]
     lea parm1q, [parm1q+parm2q]
-    jne .height_loop
+    jg .height_loop
+    rep ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
+;                                 uint8_t *src1, int i_src1_stride,
+;                                 uint8_t *src2, int i_src2_stride,
+;                                 int i_height );
+;-----------------------------------------------------------------------------
+x264_pixel_avg_w20_mmxext:
+    mov    r10, parm5q          ; src2
+    movsxd r11, parm6d          ; i_src2_stride
+    mov    eax, parm7d          ; i_height
+
+ALIGN 4
+.height_loop
+    movq  mm0, [parm3q   ]
+    movq  mm1, [parm3q+8 ]
+    movd  mm2, [parm3q+16]
+    pavgb mm0, [r10   ]
+    pavgb mm1, [r10+8 ]
+    pavgb mm2, [r10+16]
+    movq  [parm1q   ], mm0
+    movq  [parm1q+8 ], mm1
+    movd  [parm1q+16], mm2
+    dec eax
+    lea parm3q, [parm3q+parm4q]
+    lea r10, [r10+r11]
+    lea parm1q, [parm1q+parm2q]
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -183,7 +214,7 @@ ALIGN 4
     lea parm3q, [parm3q+parm4q]
     lea r10, [r10+r11]
     lea parm1q, [parm1q+parm2q]
-    jne .height_loop
+    jg .height_loop
     rep ret
 
 
@@ -244,7 +275,7 @@ x264_pixel_avg_weight_w16_mmxext:
     add parm1q, parm2q
     add parm3q, parm4q
     dec r11d
-    jnz .height_loop
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -260,7 +291,7 @@ x264_pixel_avg_weight_w8_mmxext:
     add parm1q, parm2q
     add parm3q, parm4q
     dec r11d
-    jnz .height_loop
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -301,7 +332,7 @@ ALIGN 4
     lea parm1q, [parm1q+parm2q*2]
     dec eax
     dec eax
-    jne .height_loop
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -329,7 +360,7 @@ ALIGN 4
     lea parm1q, [parm1q+parm2q*4]
     sub eax, byte 4
 
-    jnz .height_loop
+    jg .height_loop
     rep ret
 
 ALIGN 16
@@ -364,7 +395,7 @@ ALIGN 4
     lea parm3q, [parm3q+parm4q*4]
     lea parm1q, [parm1q+parm2q*4]
     sub eax, byte 4
-    jnz .height_loop
+    jg .height_loop
     rep ret
 
 
@@ -384,7 +415,7 @@ ALIGN 4
     sub eax, byte 2
     lea parm3q, [parm3q+parm4q*2]
     lea parm1q, [parm1q+parm2q*2]
-    jnz .height_loop
+    jg .height_loop
     rep ret
 
 
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm
index 6f233c4c..2e68b994 100644
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -59,6 +59,7 @@ SECTION .text
 cglobal x264_pixel_avg_w4_mmxext
 cglobal x264_pixel_avg_w8_mmxext
 cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
 cglobal x264_pixel_avg_w16_sse2
 
 cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -112,7 +113,7 @@ ALIGN 4
     lea ebx, [ebx+eax*2]
     lea ecx, [ecx+edx*2]
     lea edi, [edi+esi*2]
-    jne .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -151,7 +152,7 @@ ALIGN 4
     lea ebx, [ebx+eax]
     lea ecx, [ecx+edx]
     lea edi, [edi+esi]
-    jne .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -193,7 +194,7 @@ ALIGN 4
     lea ebx, [ebx+eax]
     lea ecx, [ecx+edx]
     lea edi, [edi+esi]
-    jne .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -201,6 +202,53 @@ ALIGN 4
     pop ebx
     pop ebp
     ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
+;                                 uint8_t *src1, int i_src1_stride,
+;                                 uint8_t *src2, int i_src2_stride,
+;                                 int i_height );
+;-----------------------------------------------------------------------------
+x264_pixel_avg_w20_mmxext:
+    push ebp
+    push ebx
+    push esi
+    push edi
+
+    mov edi, [esp+20]       ; dst
+    mov ebx, [esp+28]       ; src1
+    mov ecx, [esp+36]       ; src2
+    mov esi, [esp+24]       ; i_dst_stride
+    mov eax, [esp+32]       ; i_src1_stride
+    mov edx, [esp+40]       ; i_src2_stride
+    mov ebp, [esp+44]       ; i_height
+ALIGN 4
+.height_loop
+    movq  mm0, [ebx   ]
+    movq  mm1, [ebx+8 ]
+    movd  mm2, [ebx+16]
+    pavgb mm0, [ecx   ]
+    pavgb mm1, [ecx+8 ]
+    pavgb mm2, [ecx+16]
+    movq  [edi   ], mm0
+    movq  [edi+8 ], mm1
+    movd  [edi+16], mm2
+    dec ebp
+    lea ebx, [ebx+eax]
+    lea ecx, [ecx+edx]
+    lea edi, [edi+esi]
+    jg .height_loop
+
+    pop edi
+    pop esi
+    pop ebx
+    pop ebp
+    ret
+
+
+
 ALIGN 16
 ;-----------------------------------------------------------------------------
 ; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride,
@@ -231,7 +279,7 @@ ALIGN 4
     lea ebx, [ebx+eax]
     lea ecx, [ecx+edx]
     lea edi, [edi+esi]
-    jne .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -302,7 +350,7 @@ x264_pixel_avg_weight_w16_mmxext:
     add edi, esi
     add edx, ecx
     dec eax
-    jnz .height_loop
+    jg .height_loop
     BIWEIGHT_END_MMX
 
 ALIGN 16
@@ -323,7 +371,7 @@ x264_pixel_avg_weight_w8_mmxext:
     lea edi, [edi+esi*2]
     lea edx, [edx+ecx*2]
     sub eax, byte 2
-    jnz .height_loop
+    jg .height_loop
     BIWEIGHT_END_MMX
 
 ALIGN 16
@@ -371,7 +419,7 @@ ALIGN 4
     lea edi, [edi+edx*2]
     dec ecx
     dec ecx
-    jne .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -409,7 +457,7 @@ ALIGN 4
     lea edi, [edi+edx*2]
     sub ecx, byte 4
 
-    jnz .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -455,7 +503,7 @@ ALIGN 4
     lea esi, [esi+ebx*2]
     lea edi, [edi+edx*2]
     sub ecx, byte 4
-    jnz .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
@@ -488,7 +536,7 @@ ALIGN 4
     dec ecx
     lea esi, [esi+ebx*2]
     lea edi, [edi+edx*2]
-    jnz .height_loop
+    jg .height_loop
 
     pop edi
     pop esi
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c
index 2f89b105..207c0c67 100644
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -31,6 +31,7 @@
 extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
 extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -68,13 +69,14 @@ AVG_WEIGHT(8,16)
 AVG_WEIGHT(8,8)
 AVG_WEIGHT(8,4)
 
-static void (* const x264_pixel_avg_wtab_mmxext[5])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
+static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
 {
     NULL,
     x264_pixel_avg_w4_mmxext,
     x264_pixel_avg_w8_mmxext,
-    NULL,
-    x264_pixel_avg_w16_mmxext
+    x264_pixel_avg_w16_mmxext,
+    x264_pixel_avg_w16_mmxext,
+    x264_pixel_avg_w20_mmxext,
 };
 static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
 {
diff --git a/common/mc.h b/common/mc.h
index 9c9fe517..2a022737 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -37,6 +37,7 @@ typedef struct
                     int mvx, int mvy,
                     int i_width, int i_height );
 
+    /* may round up the dimensions if they're not a power of 2 */
     uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
                         int mvx, int mvy,
                         int i_width, int i_height );
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 2573f0e9..96245174 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -224,10 +224,19 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
             pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
                           src2, i_src_stride, i_height );
             break;
+        case 12:
         case 16:
         default:
             pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
                            src2, i_src_stride, i_height );
+            break;
+        case 20:
+            //FIXME suboptimal
+            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
+                           src2, i_src_stride, i_height );
+            pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
+                          src2+16, i_src_stride, i_height );
+            break;
     }
 
     return dst;
diff --git a/encoder/me.c b/encoder/me.c
index 60edc794..ccffe2dc 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -586,7 +586,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     const int i_pixel = m->i_pixel;
     const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
-    DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+    DECLARE_ALIGNED( uint8_t, pix[2][32*18], 16 ); // really 17x17, but round up for alignment
     int omx, omy;
     int i;
 
@@ -610,20 +610,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     {
         int omx = bmx, omy = bmy;
         int costs[4];
-        int stride = 16; // candidates are either all hpel or all qpel, so one stride is enough
+        int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
         uint8_t *src0, *src1, *src2, *src3;
-        src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh );
-        src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[2], &stride, omx-2, omy, bw, bh );
-        if( (omx|omy)&1 )
-        {
-            src1 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx, omy+2, bw, bh );
-            src3 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[3], &stride, omx+2, omy, bw, bh );
-        }
-        else
-        {
-            src1 = src0 + stride;
-            src3 = src2 + 1;
-        }
+        src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
+        src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
+        src1 = src0 + stride;
+        src3 = src2 + 1;
         h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
         COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 );
         COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 );
-- 
2.40.0
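
Why the encoder/me.c change is equivalent: the four half-pel candidates probed around (omx,omy) differ by +/-2 in quarter-pel units, i.e. by exactly one full pixel, so the omy+2 candidate lies one row below the omy-2 candidate inside the block that get_ref returns, and the omx+2 candidate one byte to the right of the omx-2 one. Asking get_ref for one extra row (bh+1) or a few extra columns (bw+4) therefore covers both members of each pair. Below is a minimal C sketch of that idea, with illustrative names only: copy_block stands in for the interpolation that get_ref actually performs, and fetch_vertical_pair is not a function from the tree.

    #include <stdint.h>
    #include <string.h>

    /* Stand-in for get_ref: here just a copy out of an already
     * interpolated, padded reference plane. */
    static void copy_block( uint8_t *dst, int i_dst_stride,
                            const uint8_t *src, int i_src_stride,
                            int width, int height )
    {
        int y;
        for( y = 0; y < height; y++ )
            memcpy( dst + y*i_dst_stride, src + y*i_src_stride, width );
    }

    /* One bw x (bh+1) fetch replaces two bw x bh fetches: the upper
     * candidate uses rows 0..bh-1 of buf, the lower one uses rows 1..bh,
     * i.e. the same buffer offset by one stride. */
    static void fetch_vertical_pair( uint8_t *buf, int i_buf_stride,
                                     const uint8_t *ref, int i_ref_stride,
                                     int bw, int bh,
                                     uint8_t **src_up, uint8_t **src_down )
    {
        copy_block( buf, i_buf_stride, ref, i_ref_stride, bw, bh+1 );
        *src_up   = buf;
        *src_down = buf + i_buf_stride;
    }

The horizontal pair works the same way with a wider fetch and src3 = src2 + 1, which is why the request grows to bw+4 and a width-20 averaging kernel is added.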
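
For reference, the new x264_pixel_avg_w20_mmxext computes the same rounding average as the existing w4/w8/w16 helpers, just over 20-pixel rows (8+8+4 bytes per iteration via movq/movq/movd plus pavgb). A plain-C equivalent of what the MMX loop does is sketched below; the name pixel_avg_w20_c is chosen here to match the style of the C fallbacks and is not a function taken from the tree.

    #include <stdint.h>

    static void pixel_avg_w20_c( uint8_t *dst,  int i_dst_stride,
                                 uint8_t *src1, int i_src1_stride,
                                 uint8_t *src2, int i_src2_stride,
                                 int i_height )
    {
        int x, y;
        for( y = 0; y < i_height; y++ )
        {
            for( x = 0; x < 20; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; /* pavgb rounds up */
            dst  += i_dst_stride;
            src1 += i_src1_stride;
            src2 += i_src2_stride;
        }
    }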