From b7d27eaab35a6fdffc66ffff51bd287b0f67bb3e Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Wed, 17 Sep 2008 21:25:05 -0700
Subject: [PATCH] Rewrite avg/avg_weight to take two source pointers

This allows the use of get_ref instead of mc_luma almost everywhere for
bipred.
---
 common/macroblock.c | 64 ++++++++++++++++++++-------------
 common/mc.c         | 25 +++++++------
 common/mc.h         |  4 +--
 common/x86/mc-a.asm | 81 +++++++++++++++++++++--------------------
 common/x86/mc-c.c   | 47 ++++++++++++------------
 encoder/analyse.c   | 88 +++++++++++++++++++--------------------------
 encoder/me.c        | 16 +++++----
 encoder/slicetype.c | 14 ++++----
 tools/checkasm.c    | 14 ++++----
 9 files changed, 182 insertions(+), 171 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 883b515e..39f50aa8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -554,47 +554,63 @@ static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int hei
 static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
-
+    const int i_ref0 = h->mb.cache.ref[0][i8];
     const int i_ref1 = h->mb.cache.ref[1][i8];
+    const int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
     const int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
+    int       mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
     int       mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
-    DECLARE_ALIGNED_16( uint8_t tmp[16*16] );
-    int i_mode = x264_size2pixel[height][width];
-
-    x264_mb_mc_0xywh( h, x, y, width, height );
-
-    h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                   mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
-
+    int       i_mode = x264_size2pixel[height][width];
+    int       i_stride0 = 16, i_stride1 = 16;
+    DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
+    DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+    uint8_t *src0, *src1;
+
+    src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
+                          mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+    src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
+                          mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+
+    if( h->mb.b_interlaced & i_ref0 )
+        mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
     if( h->mb.b_interlaced & i_ref1 )
         mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
 
+    h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                     &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     mvx0, mvy0, 2*width, 2*height );
+
+    h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                     &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     mvx0, mvy0, 2*width, 2*height );
+
     if( h->param.analyse.b_weighted_bipred )
     {
-        const int i_ref0 = h->mb.cache.ref[0][i8];
         const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-        h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+        h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+                                  src0, i_stride0, src1, i_stride1, weight );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                    &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
+        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                    &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
     }
     else
     {
-        h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+        h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+                           src0, i_stride0, src1, i_stride1 );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                             &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                             &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
     }
 }
diff --git a/common/mc.c b/common/mc.c
index 2be45cc7..9fc7a343 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -49,25 +49,27 @@ static inline void pixel_avg( uint8_t *dst, int i_dst_stride,
     }
 }
 
-static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
 {
     int x, y;
     for( y = 0; y < height; y++ )
     {
         for( x = 0; x < width; x++ )
         {
-            dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
+            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
         }
+        src1 += i_src1;
+        src2 += i_src2;
         dst += i_dst;
-        src += i_src;
     }
 }
 
 #define PIXEL_AVG_C( name, width, height ) \
 static void name( uint8_t *pix1, int i_stride_pix1, \
-                  uint8_t *pix2, int i_stride_pix2 ) \
+                  uint8_t *pix2, int i_stride_pix2, \
+                  uint8_t *pix3, int i_stride_pix3 ) \
 { \
-    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
+    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
 }
 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
 PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
@@ -83,11 +85,13 @@ PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
 
 /* Implicit weighted bipred only:
  * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-#define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
-static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
+#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
+static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
+{
     int y;
     const int i_weight2 = 64 - i_weight1;
-    for(y=0; y<height; y++, dst += i_dst, src += i_src){
[...]

diff --git a/common/mc.h b/common/mc.h
[...]

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
[...]

diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
[...]
         pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
         pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
         pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
-            pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
-            pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
-            pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
-            pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
-        }
+        pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
+        pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
+        pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
+        pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
+        pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
         pf->hpel_filter = x264_hpel_filter_sse2;
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
         pf->mc_chroma = x264_mc_chroma_sse2;
diff --git a/encoder/analyse.c b/encoder/analyse.c
index ecbaf4ec..7b941c55 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1474,21 +1474,21 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     }
 }
 
-#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
-    { \
-        if( h->param.analyse.b_weighted_bipred ) \
-            h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
-                    h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-        else \
-            h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
-    }
+#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
+{ \
+    if( h->param.analyse.b_weighted_bipred ) \
+        h->mc.avg_weight[size]( pix, stride, src1, stride1, src2, stride2, \
+                h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
+    else \
+        h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2 ); \
+}
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
+    DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
     DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
-    DECLARE_ALIGNED_16( uint8_t pix2[16*16] );
-    uint8_t *src2;
-    int stride2 = 16;
+    uint8_t *src0, *src1;
+    int stride0 = 16, stride1 = 16;
     int weight;
 
     x264_me_t m;
@@ -1560,40 +1560,19 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     /* get cost of BI mode */
     weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
 
-    if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
-    {
-        /* l0 reference is halfpel, so get_ref on it will make it faster */
-        src2 =
-        h->mc.get_ref( pix2, &stride2,
-                       h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                       a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
-                       16, 16 );
-        h->mc.mc_luma( pix1, 16,
-                       h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                       a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
-                       16, 16 );
-        weight = 64 - weight;
-    }
-    else
-    {
-        /* if l0 was qpel, we'll use get_ref on l1 instead */
-        h->mc.mc_luma( pix1, 16,
-                       h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                       a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
-                       16, 16 );
-        src2 =
-        h->mc.get_ref( pix2, &stride2,
-                       h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                       a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
-                       16, 16 );
-    }
+    src0 = h->mc.get_ref( pix0, &stride0,
+                          h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+    src1 = h->mc.get_ref( pix1, &stride1,
+                          h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
 
     if( h->param.analyse.b_weighted_bipred )
-        h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
+        h->mc.avg_weight[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, weight );
     else
-        h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
+        h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
 
-    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
+    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                      + REF_COST( 0, a->l0.i_ref )
                      + REF_COST( 1, a->l1.i_ref )
                      + a->l0.me16x16.cost_mv
@@ -1709,6 +1688,8 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         const int y8 = i/2;
         int i_part_cost;
         int i_part_cost_bi = 0;
+        int stride[2] = {8,8};
+        uint8_t *src[2];
 
         for( l = 0; l < 2; l++ )
         {
@@ -1727,13 +1708,12 @@
             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
 
             /* BI mode */
-            h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
-                           m->mv[0], m->mv[1], 8, 8 );
+            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                    m->mv[0], m->mv[1], 8, 8 );
             i_part_cost_bi += m->cost_mv;
             /* FIXME: ref cost */
         }
-
-        WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
+        WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, src[0], stride[0], src[1], stride[1] );
         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1759,7 +1739,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref[2] =
         { h->mb.pic.p_fref[0][a->l0.i_ref],
           h->mb.pic.p_fref[1][a->l1.i_ref] };
-    DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
+    DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
     DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
@@ -1770,6 +1750,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
     {
         int i_part_cost;
        int i_part_cost_bi = 0;
+        int stride[2] = {16,16};
+        uint8_t *src[2];
 
        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
@@ -1790,13 +1772,13 @@
            x264_me_search( h, m, mvc, 2 );
 
            /* BI mode */
-           h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
-                          m->mv[0], m->mv[1], 16, 8 );
+           src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                   m->mv[0], m->mv[1], 16, 8 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
-       WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
+       WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, src[0], stride[0], src[1], stride[1] );
 
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
 
        i_part_cost = a->l0.me16x8[i].cost;
@@ -1839,6 +1821,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
     {
        int i_part_cost;
        int i_part_cost_bi = 0;
+        int stride[2] = {8,8};
+        uint8_t *src[2];
 
        for( l = 0; l < 2; l++ )
        {
@@ -1858,13 +1842,13 @@
            x264_me_search( h, m, mvc, 2 );
 
            /* BI mode */
-           h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
-                          m->mv[0], m->mv[1], 8, 16 );
+           src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                   m->mv[0], m->mv[1], 8, 16 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
-       WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
+       WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, src[0], stride[0], src[1], stride[1] );
 
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
 
        i_part_cost = a->l0.me8x16[i].cost;
diff --git a/encoder/me.c b/encoder/me.c
index 63c57863..8892e340 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -787,8 +787,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define BIME_CACHE( dx, dy ) \
 { \
     int i = 4 + 3*dx + dy; \
-    h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
-    h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+    stride0[i] = bw;\
+    stride1[i] = bw;\
+    src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
+    src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
 }
 
 #define BIME_CACHE2(a,b) \
@@ -802,11 +804,10 @@ if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
     int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
     int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
     visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
-    h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
     if( i_weight == 32 ) \
-        h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
+        h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1] ); \
     else \
-        h->mc.avg_weight[i_pixel]( pix, bw, pix1[i1], bw, i_weight ); \
+        h->mc.avg_weight[i_pixel]( pix, bw, src1[i1], stride1[i1], src0[i0], stride0[i0], i_weight ); \
     cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
            + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
            + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
@@ -838,7 +839,6 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    const int bs = bw*bh;
     const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
     const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
     const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
@@ -846,6 +846,10 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
     DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
     DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+    uint8_t *src0[9];
+    uint8_t *src1[9];
+    int stride0[9];
+    int stride1[9];
     int bm0x = m0->mv[0], om0x = bm0x;
     int bm0y = m0->mv[1], om0y = bm0y;
     int bm1x = m1->mv[0], om1x = bm1x;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 04034f19..ed10698a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -95,17 +95,17 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
 }
 #define TRY_BIDIR( mv0, mv1, penalty ) \
 { \
-    int stride2 = 16; \
-    uint8_t *src2; \
+    int stride1 = 16, stride2 = 16; \
+    uint8_t *src1, *src2; \
     int i_cost; \
-    h->mc.mc_luma( pix1, 16, m[0].p_fref, m[0].i_stride[0], \
-                   (mv0)[0], (mv0)[1], 8, 8 ); \
+    src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+                          (mv0)[0], (mv0)[1], 8, 8 ); \
     src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
-                   (mv1)[0], (mv1)[1], 8, 8 ); \
+                          (mv1)[0], (mv1)[1], 8, 8 ); \
     if( i_bipred_weight != 32 ) \
-        h->mc.avg_weight[PIXEL_8x8]( pix1, 16, src2, stride2, i_bipred_weight ); \
+        h->mc.avg_weight[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
    else \
-        h->mc.avg[PIXEL_8x8]( pix1, 16, src2, stride2 ); \
+        h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2 ); \
     i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
                        m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
     if( i_bcost > i_cost ) \
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 82779bde..37f2b074 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -774,27 +774,27 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define MC_TEST_AVG( name, ... ) \
     for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
     { \
-        memcpy( buf3, buf1, 1024 ); \
-        memcpy( buf4, buf1, 1024 ); \
+        memcpy( buf2, buf1, 1024 ); \
+        memcpy( buf4, buf3, 1024 ); \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] );\
            used_asm = 1; \
-           call_c1( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
-           call_a1( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+           call_c1( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+           call_a1( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
            if( memcmp( buf3, buf4, 1024 ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
            } \
-           call_c2( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
-           call_a2( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+           call_c2( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+           call_a2( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
        } \
     }
 
     MC_TEST_AVG( avg );
     report( "mc avg :" );
     ok = 1; used_asm = 0;
-    for( w = -64; w <= 128 && ok; w++ )
+    for( w = 32; w <= 32 && ok; w++ )
         MC_TEST_AVG( avg_weight, w );
     report( "mc wpredb :" );
-- 
2.40.0
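
Appended illustration (not part of the patch): a minimal standalone sketch of the calling convention this change introduces. After the patch, avg/avg_weight no longer read the destination as one of their inputs; each prediction arrives through its own pointer/stride pair, so either source can point straight at a reference plane returned by get_ref or at a temporary filled by mc_luma. avg_wxh below mirrors pixel_avg_wxh from common/mc.c as rewritten here; the buffer names, sizes and the main() harness are purely illustrative.

/* Hypothetical demo harness -- not x264 code. */
#include <stdint.h>
#include <stdio.h>

static void avg_wxh( uint8_t *dst, int i_dst,
                     uint8_t *src1, int i_src1,
                     uint8_t *src2, int i_src2,
                     int width, int height )
{
    int x, y;
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;   /* rounded mean of the two predictions */
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}

int main( void )
{
    uint8_t l0[4*4], l1[4*4], bi[4*4];
    int i;
    for( i = 0; i < 16; i++ )
    {
        l0[i] = (uint8_t)i;        /* stand-in for a list-0 prediction */
        l1[i] = (uint8_t)(2*i);    /* stand-in for a list-1 prediction */
    }
    /* each source carries its own stride; here all three buffers use stride 4 */
    avg_wxh( bi, 4, l0, 4, l1, 4, 4, 4 );
    printf( "bi[5] = %d\n", bi[5] );   /* (5 + 10 + 1) >> 1 = 8 */
    return 0;
}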