%define parm7d dword parm7q
%define parm8d dword parm8q
+%define temp1q rdi
+%define temp2q rsi
+%define temp1d edi
+%define temp2d esi
+
%macro firstpush 1
db 0x48
push %1
%define parm7d dword parm7q
%define parm8d dword parm8q
+%define temp1q r9
+%define temp2q r8
+%define temp1d r9d
+%define temp2d r8d
+
%macro allocstack 1
%endmacro
cglobal x264_pixel_avg_w4_mmxext
cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
lea parm3q, [parm3q+parm4q*2]
lea r10, [r10+r11*2]
lea parm1q, [parm1q+parm2q*2]
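+ ; jg rather than jne/jnz throughout: get_ref may now request odd heights
+ ; (bh+1), and loops that step 2 or 4 rows per pass would never hit exactly zero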
- jne .height_loop
+ jg .height_loop
rep ret
lea parm3q, [parm3q+parm4q*2]
lea r10, [r10+r11*2]
lea parm1q, [parm1q+parm2q*2]
- jne .height_loop
+ jg .height_loop
rep ret
ALIGN 16
lea parm3q, [parm3q+parm4q]
lea r10, [r10+r11]
lea parm1q, [parm1q+parm2q]
- jne .height_loop
+ jg .height_loop
+ rep ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src1, int i_src1_stride,
+; uint8_t *src2, int i_src2_stride,
+; int i_height );
+;-----------------------------------------------------------------------------
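+; Each 20-byte row is averaged as two movq/pavgb pairs (16 bytes) plus one
+; movd/pavgb for the last 4 bytes. The final pavgb still reads a full qword
+; from src2; only the low dword is stored, and the 4-byte over-read is assumed
+; to stay within the reference frame's padding.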
+x264_pixel_avg_w20_mmxext:
+ mov r10, parm5q ; src2
+ movsxd r11, parm6d ; i_src2_stride
+ mov eax, parm7d ; i_height
+
+ALIGN 4
+.height_loop
+ movq mm0, [parm3q ]
+ movq mm1, [parm3q+8 ]
+ movd mm2, [parm3q+16]
+ pavgb mm0, [r10 ]
+ pavgb mm1, [r10+8 ]
+ pavgb mm2, [r10+16]
+ movq [parm1q ], mm0
+ movq [parm1q+8 ], mm1
+ movd [parm1q+16], mm2
+ dec eax
+ lea parm3q, [parm3q+parm4q]
+ lea r10, [r10+r11]
+ lea parm1q, [parm1q+parm2q]
+ jg .height_loop
rep ret
ALIGN 16
lea parm3q, [parm3q+parm4q]
lea r10, [r10+r11]
lea parm1q, [parm1q+parm2q]
- jne .height_loop
+ jg .height_loop
rep ret
add parm1q, parm2q
add parm3q, parm4q
dec r11d
- jnz .height_loop
+ jg .height_loop
rep ret
ALIGN 16
add parm1q, parm2q
add parm3q, parm4q
dec r11d
- jnz .height_loop
+ jg .height_loop
rep ret
ALIGN 16
lea parm1q, [parm1q+parm2q*2]
dec eax
dec eax
- jne .height_loop
+ jg .height_loop
rep ret
ALIGN 16
lea parm1q, [parm1q+parm2q*4]
sub eax, byte 4
- jnz .height_loop
+ jg .height_loop
rep ret
ALIGN 16
lea parm3q, [parm3q+parm4q*4]
lea parm1q, [parm1q+parm2q*4]
sub eax, byte 4
- jnz .height_loop
+ jg .height_loop
rep ret
sub eax, byte 2
lea parm3q, [parm3q+parm4q*2]
lea parm1q, [parm1q+parm2q*2]
- jnz .height_loop
+ jg .height_loop
rep ret
cglobal x264_pixel_avg_w4_mmxext
cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
lea ebx, [ebx+eax*2]
lea ecx, [ecx+edx*2]
lea edi, [edi+esi*2]
- jne .height_loop
+ jg .height_loop
pop edi
pop esi
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
- jne .height_loop
+ jg .height_loop
pop edi
pop esi
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
- jne .height_loop
+ jg .height_loop
pop edi
pop esi
pop ebp
ret
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src1, int i_src1_stride,
+; uint8_t *src2, int i_src2_stride,
+; int i_height );
+;-----------------------------------------------------------------------------
+x264_pixel_avg_w20_mmxext:
+ push ebp
+ push ebx
+ push esi
+ push edi
+
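+ ; four registers were pushed above, so the first stack argument (dst) is at [esp+20]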
+ mov edi, [esp+20] ; dst
+ mov ebx, [esp+28] ; src1
+ mov ecx, [esp+36] ; src2
+ mov esi, [esp+24] ; i_dst_stride
+ mov eax, [esp+32] ; i_src1_stride
+ mov edx, [esp+40] ; i_src2_stride
+ mov ebp, [esp+44] ; i_height
+ALIGN 4
+.height_loop
+ movq mm0, [ebx ]
+ movq mm1, [ebx+8 ]
+ movd mm2, [ebx+16]
+ pavgb mm0, [ecx ]
+ pavgb mm1, [ecx+8 ]
+ pavgb mm2, [ecx+16]
+ movq [edi ], mm0
+ movq [edi+8 ], mm1
+ movd [edi+16], mm2
+ dec ebp
+ lea ebx, [ebx+eax]
+ lea ecx, [ecx+edx]
+ lea edi, [edi+esi]
+ jg .height_loop
+
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+
+
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride,
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
- jne .height_loop
+ jg .height_loop
pop edi
pop esi
add edi, esi
add edx, ecx
dec eax
- jnz .height_loop
+ jg .height_loop
BIWEIGHT_END_MMX
ALIGN 16
lea edi, [edi+esi*2]
lea edx, [edx+ecx*2]
sub eax, byte 2
- jnz .height_loop
+ jg .height_loop
BIWEIGHT_END_MMX
ALIGN 16
lea edi, [edi+edx*2]
dec ecx
dec ecx
- jne .height_loop
+ jg .height_loop
pop edi
pop esi
lea edi, [edi+edx*2]
sub ecx, byte 4
- jnz .height_loop
+ jg .height_loop
pop edi
pop esi
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
sub ecx, byte 4
- jnz .height_loop
+ jg .height_loop
pop edi
pop esi
dec ecx
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
- jnz .height_loop
+ jg .height_loop
pop edi
pop esi
extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
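+/* indexed by block width / 4; the 12-wide slot (bw+4 fetches of 8-wide blocks)
+   reuses the 16-wide kernel, whose 4 extra bytes per row are assumed to land
+   in the caller's padded buffer */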
-static void (* const x264_pixel_avg_wtab_mmxext[5])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
+static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
{
NULL,
x264_pixel_avg_w4_mmxext,
x264_pixel_avg_w8_mmxext,
- NULL,
- x264_pixel_avg_w16_mmxext
+ x264_pixel_avg_w16_mmxext,
+ x264_pixel_avg_w16_mmxext,
+ x264_pixel_avg_w20_mmxext,
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
int mvx, int mvy,
int i_width, int i_height );
+ /* may round up the dimensions if they're not a power of 2 */
uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
int mvx, int mvy,
int i_width, int i_height );
pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_height );
break;
+ case 12:
case 16:
default:
pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_height );
+ break;
+ case 20:
+ //FIXME suboptimal
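+ // average the first 16 bytes and the remaining 4 separately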
+ pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
+ src2, i_src_stride, i_height );
+ pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
+ src2+16, i_src_stride, i_height );
+ break;
}
return dst;
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+ DECLARE_ALIGNED( uint8_t, pix[2][32*18], 16 ); // really 16x17 and 20x16, but round up for alignment
int omx, omy;
int i;
{
int omx = bmx, omy = bmy;
int costs[4];
- int stride = 16; // candidates are either all hpel or all qpel, so one stride is enough
+ int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
uint8_t *src0, *src1, *src2, *src3;
- src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh );
- src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[2], &stride, omx-2, omy, bw, bh );
- if( (omx|omy)&1 )
- {
- src1 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx, omy+2, bw, bh );
- src3 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[3], &stride, omx+2, omy, bw, bh );
- }
- else
- {
- src1 = src0 + stride;
- src3 = src2 + 1;
- }
+ src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
+ src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
+ src1 = src0 + stride;
+ src3 = src2 + 1;
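+ // src0 was fetched with one extra row and src2 with four extra columns,
+ // so the omy+2 and omx+2 candidates are plain offsets into the same buffers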
h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 );
COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 );