;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
-; uint8_t *dst, int i_dst_stride,
+; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride,
; int dx, int dy,
; int i_width, int i_height )
;-----------------------------------------------------------------------------
mov r11d, parm5d
sar r10d, 3
sar r11d, 3
- imul r10d, parm2d
+ imul r10d, parm4d
pxor mm3, mm3
add r10d, r11d
movsxd r10, r10d
mov r11d, parm8d
- add parm1q, r10 ; src += (dx>>3) + (dy>>3) * src_stride
+ add parm3q, r10 ; src += (dx>>3) + (dy>>3) * src_stride
and parm5d, 7 ; dx &= 7
je .mc1d
and parm6d, 7 ; dy &= 7
pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
- mov rax, parm1q
- mov r10, parm3q
+ mov rax, parm3q
+ mov r10, parm1q
ALIGN 4
.height_loop
- movd mm1, [rax+parm2q]
+ movd mm1, [rax+parm4q]
movd mm0, [rax]
punpcklbw mm1, mm3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
punpcklbw mm0, mm3
paddw mm0, mm1 ; mm0 <- result
movd mm2, [rax+1]
- movd mm1, [rax+parm2q+1]
+ movd mm1, [rax+parm4q+1]
punpcklbw mm2, mm3
punpcklbw mm1, mm3
packuswb mm0, mm3 ; 00 00 00 00 px1 px2 px3 px4
movd [r10], mm0
- add rax, parm2q
- add r10, parm4q ; i_dst_stride
+ add rax, parm4q
+ add r10, parm2q ; i_dst_stride
dec r11d
jnz .height_loop
sub parm7d, 8
jnz .finish ; width != 8 so assume 4
- mov r10, parm3q ; dst
- mov rax, parm1q ; src
+ mov r10, parm1q ; dst
+ mov rax, parm3q ; src
mov r11d, parm8d ; i_height
add r10, 4
add rax, 4
ALIGN 4
.mc1d
-%ifdef WIN64
-%define pel_offset rsi
-%else
-%define pel_offset r9
-%endif
+%define pel_offset temp1q
mov eax, parm5d
or eax, parm6d
and eax, 7
cmp parm5d, 0
mov pel_offset, 1
- cmove pel_offset, parm2q ; pel_offset = dx ? 1 : src_stride
+ cmove pel_offset, parm4q ; pel_offset = dx ? 1 : src_stride
movd mm6, eax
movq mm5, [pw_8 GLOBAL]
pshufw mm6, mm6, 0
ALIGN 4
.height_loop1_w4
- movd mm0, [parm1q+pel_offset]
- movd mm1, [parm1q]
+ movd mm0, [parm3q+pel_offset]
+ movd mm1, [parm3q]
punpcklbw mm0, mm3
punpcklbw mm1, mm3
pmullw mm0, mm6
paddw mm0, mm1
psrlw mm0, 3
packuswb mm0, mm3
- movd [parm3q], mm0
- add parm1q, parm2q
+ movd [parm1q], mm0
add parm3q, parm4q
+ add parm1q, parm2q
dec r11d
jnz .height_loop1_w4
rep ret
ALIGN 4
.height_loop1_w8
- movq mm0, [parm1q+pel_offset]
- movq mm1, [parm1q]
+ movq mm0, [parm3q+pel_offset]
+ movq mm1, [parm3q]
movq mm2, mm0
movq mm4, mm1
punpcklbw mm0, mm3
psrlw mm0, 3
psrlw mm2, 3
packuswb mm0, mm2
- movq [parm3q], mm0
- add parm1q, parm2q
+ movq [parm1q], mm0
add parm3q, parm4q
+ add parm1q, parm2q
dec r11d
jnz .height_loop1_w8
rep ret
;=============================================================================
;-----------------------------------------------------------------------------
-; void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
-; uint8_t *dst, int i_dst_stride,
+; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride,
; int dx, int dy,
; int i_width, int i_height )
;-----------------------------------------------------------------------------
mov edi, edx
sar ecx, 3
sar edx, 3
- imul ecx, [picesp+4+8]
+ imul ecx, [picesp+4+16]
add ecx, edx
- add [picesp+4+4], ecx ; src += (dx>>3) + (dy>>3) * src_stride
+ add [picesp+4+12], ecx ; src += (dx>>3) + (dy>>3) * src_stride
pxor mm3, mm3
pmullw mm6, mm4 ; mm6 = (8-dx)*dy = cC
pmullw mm4, mm0 ; mm4 = (8-dx)*(8-dy) = cA
- mov eax, [picesp+4+4] ; src
- mov edi, [picesp+4+12] ; dst
- mov ecx, [picesp+4+8] ; i_src_stride
+ mov eax, [picesp+4+12] ; src
+ mov edi, [picesp+4+4] ; dst
+ mov ecx, [picesp+4+16] ; i_src_stride
mov edx, [picesp+4+32] ; i_height
ALIGN 4
movd [edi], mm0
add eax, ecx
- add edi, [picesp+4+16]
+ add edi, [picesp+4+8]
dec edx
jnz .height_loop
sub [picesp+4+28], dword 8
jnz .finish ; width != 8 so assume 4
- mov edi, [picesp+4+12] ; dst
- mov eax, [picesp+4+4] ; src
+ mov edi, [picesp+4+4] ; dst
+ mov eax, [picesp+4+12] ; src
mov edx, [picesp+4+32] ; i_height
add edi, 4
add eax, 4
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+void mc_luma_mmx( uint8_t *dst, int i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
}
}
-uint8_t *get_ref_mmx( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int *i_dst_stride,
+uint8_t *get_ref_mmx( uint8_t *dst, int *i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
const int mvx = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
int mvy = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- h->mc.mc_luma( h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
- &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0],
+ mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
// chroma is offset if MCing from a field of opposite parity
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- mvx, mvy, 2*width, 2*height );
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fref[0][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx, mvy, 2*width, 2*height );
- h->mc.mc_chroma( &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- mvx, mvy, 2*width, 2*height );
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fref[0][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx, mvy, 2*width, 2*height );
}
static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
{
const int mvx = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
int mvy = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
- h->mc.mc_luma( h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
- &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
- mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
+ h->mc.mc_luma( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+ h->mb.pic.p_fref[1][i_ref], h->mb.pic.i_stride[0],
+ mvx + 4*4*x, mvy + 4*4*y, 4*width, 4*height );
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- mvx, mvy, 2*width, 2*height );
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fref[1][i_ref][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx, mvy, 2*width, 2*height );
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- mvx, mvy, 2*width, 2*height );
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ &h->mb.pic.p_fref[1][i_ref][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx, mvy, 2*width, 2*height );
}
static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
x264_mb_mc_0xywh( h, x, y, width, height );
- h->mc.mc_luma( h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
- tmp, 16, mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+ h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
+ mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- tmp, 16, mvx1, mvy1, 2*width, 2*height );
+ h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- tmp, 16, mvx1, mvy1, 2*width, 2*height );
+ h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
}
else
{
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
- tmp, 16, mvx1, mvy1, 2*width, 2*height );
+ h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+ mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
- h->mc.mc_chroma( &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
- tmp, 16, mvx1, mvy1, 2*width, 2*height );
+ h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+ mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
}
}
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-static void mc_luma( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+static void mc_luma( uint8_t *dst, int i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
}
}
-static uint8_t *get_ref( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int *i_dst_stride,
+static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
}
/* full chroma mc (i.e. up to 1/8-pixel precision) */
-static void motion_compensation_chroma( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+static void mc_chroma( uint8_t *dst, int i_dst_stride,
+ uint8_t *src, int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
{
uint8_t *srcp;
int x, y;
{
pf->mc_luma = mc_luma;
pf->get_ref = get_ref;
- pf->mc_chroma = motion_compensation_chroma;
+ pf->mc_chroma = mc_chroma;
pf->avg[PIXEL_16x16]= pixel_avg_16x16;
pf->avg[PIXEL_16x8] = pixel_avg_16x8;
typedef struct
{
- void (*mc_luma)(uint8_t **, int, uint8_t *, int,
+ void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
/* may round up the dimensions if they're not a power of 2 */
- uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
+ uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
- void (*mc_chroma)(uint8_t *, int, uint8_t *, int,
+ void (*mc_chroma)(uint8_t *dst, int i_dst, uint8_t *src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
MC_COPY( mc_copy_w8, 8 )
MC_COPY( mc_copy_w16, 16 )
-void mc_luma_altivec( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
}
}
-uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
- uint8_t *dst, int * i_dst_stride,
+uint8_t *get_ref_altivec( uint8_t *dst, int * i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
src##a##v_16 = vec_mladd( coeff##a##v, src##a##v_16, zero_u16v ); \
dstv_16 = vec_add( dstv_16, src##a##v_16 )
-static void mc_chroma_altivec_4xh( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
+ uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
{
}
}
-static void mc_chroma_altivec_8xh( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
+ uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_height )
{
}
}
-static void mc_chroma_altivec( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
+static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
+ uint8_t *src, int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
if( i_width == 8 )
{
- mc_chroma_altivec_8xh( src, i_src_stride, dst, i_dst_stride,
+ mc_chroma_altivec_8xh( dst, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
}
else
{
- mc_chroma_altivec_4xh( src, i_src_stride, dst, i_dst_stride,
+ mc_chroma_altivec_4xh( dst, i_dst_stride, src, i_src_stride,
mvx, mvy, i_height );
}
}
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
#define CHROMA4x4MC( width, height, me, x, y ) \
- h->mc.mc_chroma( &p_fref[4][or+x+y*i_stride], i_stride, &pix1[x+y*16], 16, (me).mv[0], (me).mv[1], width, height ); \
- h->mc.mc_chroma( &p_fref[5][or+x+y*i_stride], i_stride, &pix2[x+y*16], 16, (me).mv[0], (me).mv[1], width, height );
+ h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height ); \
+ h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1], width, height );
if( pixel == PIXEL_4x4 )
{
if ( ((a->l0.me16x16.mv[0] | a->l0.me16x16.mv[1]) & 1) == 0 )
{
/* l0 reference is halfpel, so get_ref on it will make it faster */
- src2 = h->mc.get_ref( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- pix2, &stride2,
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- h->mc.mc_luma( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- pix1, 16,
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
+ src2 =
+ h->mc.get_ref( pix2, &stride2,
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
+ 16, 16 );
+ h->mc.mc_luma( pix1, 16,
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
+ 16, 16 );
weight = 64 - weight;
}
else
{
/* if l0 was qpel, we'll use get_ref on l1 instead */
- h->mc.mc_luma( h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
- pix1, 16,
- a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
- 16, 16 );
- src2 = h->mc.get_ref( h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
- pix2, &stride2,
- a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
- 16, 16 );
+ h->mc.mc_luma( pix1, 16,
+ h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+ a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
+ 16, 16 );
+ src2 =
+ h->mc.get_ref( pix2, &stride2,
+ h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+ a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
+ 16, 16 );
}
if( h->param.analyse.b_weighted_bipred )
x264_macroblock_cache_mv( h, 2*x8, 2*y8, 2, 2, l, m->mv[0], m->mv[1] );
/* BI mode */
- h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
- m->mv[0], m->mv[1], 8, 8 );
+ h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 8 );
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 16,
- m->mv[0], m->mv[1], 16, 8 );
+ h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 16, 8 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
x264_me_search( h, m, mvc, 2 );
/* BI mode */
- h->mc.mc_luma( m->p_fref, m->i_stride[0], pix[l], 8,
- m->mv[0], m->mv[1], 8, 16 );
+ h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
+ m->mv[0], m->mv[1], 8, 16 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
h->mb.mv_min[1], h->mb.mv_max[1] );
/* Motion compensation XXX probably unneeded */
- h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- h->mb.pic.p_fdec[0], FDEC_STRIDE,
+ h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
mvx, mvy, 16, 16 );
/* Chroma MC */
- h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
- h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
mvx, mvy, 8, 8 );
- h->mc.mc_chroma( h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
- h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
mvx, mvy, 8, 8 );
x264_macroblock_encode_skip( h );
mvp[1] = x264_clip3( h->mb.cache.pskip_mv[1], h->mb.mv_min[1], h->mb.mv_max[1] );
/* Motion compensation */
- h->mc.mc_luma( h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
- h->mb.pic.p_fdec[0], FDEC_STRIDE,
+ h->mc.mc_luma( h->mb.pic.p_fdec[0], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0], h->mb.pic.i_stride[0],
mvp[0], mvp[1], 16, 16 );
}
if( !b_bidir )
{
- h->mc.mc_chroma( h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
- h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
mvp[0], mvp[1], 8, 8 );
}
#define COST_MV_HPEL( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
#define COST_MV_SAD( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
- h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix[0], 8, mx, my, bw/2, bh/2 ); \
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \
if( cost < bcost ) \
{ \
- h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix[0], 8, mx, my, bw/2, bh/2 ); \
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \
cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
} \
} \
int costs[4];
int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
uint8_t *src0, *src1, *src2, *src3;
- src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
- src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
+ src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 );
+ src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
src1 = src0 + stride;
src3 = src2 + 1;
h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
#define BIME_CACHE( dx, dy ) \
{ \
int i = 4 + 3*dx + dy; \
- h->mc.mc_luma( m0->p_fref, m0->i_stride[0], pix0[i], bw, om0x+dx, om0y+dy, bw, bh ); \
- h->mc.mc_luma( m1->p_fref, m1->i_stride[0], pix1[i], bw, om1x+dx, om1y+dy, bw, bh ); \
+ h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
+ h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
}
#define BIME_CACHE2(a,b) \
#define COST_MV_SATD( mx, my, dst ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw*4, bh*4 ); \
+ uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \
dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[mx] + p_cost_mvy[my]; \
COPY1_IF_LT( bsatd, dst ); \
int stride2 = 16; \
uint8_t *src2; \
int i_cost; \
- h->mc.mc_luma( m[0].p_fref, m[0].i_stride[0], pix1, 16, \
+ h->mc.mc_luma( pix1, 16, m[0].p_fref, m[0].i_stride[0], \
(mv0)[0], (mv0)[1], 8, 8 ); \
- src2 = h->mc.get_ref( m[1].p_fref, m[1].i_stride[0], pix2, &stride2, \
+ src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
(mv1)[0], (mv1)[1], 8, 8 ); \
h->mc.avg[PIXEL_8x8]( pix1, 16, src2, stride2 ); \
i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
x264_mc_functions_t mc_c;
x264_mc_functions_t mc_ref;
x264_mc_functions_t mc_a;
+ x264_pixel_function_t pixel;
uint8_t *src = &buf1[2*32+2];
uint8_t *src2[4] = { &buf1[2*32+2], &buf1[7*32+2],
x264_mc_init( 0, &mc_c );
x264_mc_init( cpu_ref, &mc_ref );
x264_mc_init( cpu_new, &mc_a );
+ x264_pixel_init( 0, &pixel );
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma != mc_ref.mc_luma ) \
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- mc_c.mc_luma( src2, 32, dst1, 16, dx, dy, w, h ); \
- mc_a.mc_luma( src2, 32, dst2, 16, dx, dy, w, h ); \
- if( memcmp( buf3, buf4, 1024 ) ) \
+ mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
+ mc_a.mc_luma( dst2, 16, src2, 32, dx, dy, w, h ); \
+ if( memcmp( buf3, buf4, 1024 ) ) \
{ \
- fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
+ fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
+ ok = 0; \
+ } \
+ } \
+ if( mc_a.get_ref != mc_ref.get_ref ) \
+ { \
+ uint8_t *ref = dst2; \
+ int ref_stride = 16; \
+ used_asm = 1; \
+ memset(buf3, 0xCD, 1024); \
+ memset(buf4, 0xCD, 1024); \
+ mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
+ ref = mc_a.get_ref( ref, &ref_stride, src2, 32, dx, dy, w, h ); \
+ if( pixel.sad[PIXEL_##w##x##h]( dst1, 16, ref, ref_stride ) ) \
+ { \
+ fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok = 0; \
} \
}
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- mc_c.mc_chroma( src, 32, dst1, 16, dx, dy, w, h ); \
- mc_a.mc_chroma( src, 32, dst2, 16, dx, dy, w, h ); \
+ mc_c.mc_chroma( dst1, 16, src, 32, dx, dy, w, h ); \
+ mc_a.mc_chroma( dst2, 16, src, 32, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
for( j=0; j<h; j++ ) \
for( i=w; i<4; i++ ) \
dst2[i+j*16] = dst1[i+j*16]; \
- if( memcmp( buf3, buf4, 1024 ) ) \
+ if( memcmp( buf3, buf4, 1024 ) ) \
{ \
- fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
+ fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok = 0; \
} \
}
ok = 1; used_asm = 0;
- for( dy = 0; dy < 4; dy++ )
- for( dx = 0; dx < 4; dx++ )
+ for( dy = -8; dy < 8; dy++ )
+ for( dx = -8; dx < 8; dx++ )
{
MC_TEST_LUMA( 16, 16 );
MC_TEST_LUMA( 16, 8 );
}
MC_TEST_AVG( avg );
report( "mc avg :" );
+ ok = 1; used_asm = 0;
for( w = -64; w <= 128 && ok; w++ )
MC_TEST_AVG( avg_weight, w );
report( "mc wpredb :" );