SAD_X( 8x8_vis )
#endif
+/****************************************************************************
+ * pixel_satd_x3 / pixel_satd_x4
+ * no faster than repeated single satd calls, but needed so that satd can be
+ * a drop-in replacement for sad in the fullpel motion search
+ ****************************************************************************/
+
+#define SATD_X( size, cpu ) \
+static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+{\
+ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
+}\
+static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+{\
+ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
+ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
+ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
+ scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
+}
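+/* DECL7 instantiates all seven partition sizes; DECL5 omits 4x8 and 4x4 for
+ * SIMD implementations whose satd covers only widths >= 8 */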
+#define SATD_X_DECL5( cpu )\
+SATD_X( 16x16, cpu )\
+SATD_X( 16x8, cpu )\
+SATD_X( 8x16, cpu )\
+SATD_X( 8x8, cpu )\
+SATD_X( 8x4, cpu )
+#define SATD_X_DECL7( cpu )\
+SATD_X_DECL5( cpu )\
+SATD_X( 4x8, cpu )\
+SATD_X( 4x4, cpu )
+
+SATD_X_DECL7()
+#ifdef HAVE_MMX
+SATD_X_DECL7( _mmxext )
+SATD_X_DECL5( _sse2 )
+#ifdef HAVE_SSE3
+SATD_X_DECL5( _ssse3 )
+#endif
+#endif
+
/****************************************************************************
* structural similarity metric
****************************************************************************/
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
+ INIT7( satd_x3, );
+ INIT7( satd_x4, );
INIT4( sa8d, );
INIT_ADS( );
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
+ INIT7( satd_x3, _mmxext );
+ INIT7( satd_x4, _mmxext );
INIT_ADS( _mmxext );
#ifdef ARCH_X86
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
+ INIT5( satd_x3, _sse2 );
+ INIT5( satd_x4, _sse2 );
INIT_ADS( _sse2 );
#ifdef ARCH_X86
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
+ INIT5( satd_x3, _ssse3 );
+ INIT5( satd_x4, _ssse3 );
INIT_ADS( _ssse3 );
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
if( h->param.b_interlaced )
{
- if( h->param.analyse.i_me_method == X264_ME_ESA )
+ if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
x264_log( h, X264_LOG_WARNING, "interlace + me=esa is not implemented\n" );
h->param.analyse.i_me_method = X264_ME_UMH;
h->param.i_cqm_preset = X264_CQM_FLAT;
if( h->param.analyse.i_me_method < X264_ME_DIA ||
- h->param.analyse.i_me_method > X264_ME_ESA )
+ h->param.analyse.i_me_method > X264_ME_TESA )
h->param.analyse.i_me_method = X264_ME_HEX;
if( h->param.analyse.i_me_range < 4 )
h->param.analyse.i_me_range = 4;
if( h->param.analyse.i_me_range > 16 && h->param.analyse.i_me_method <= X264_ME_HEX )
h->param.analyse.i_me_range = 16;
+ if( h->param.analyse.i_me_method == X264_ME_TESA &&
+ (h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1) )
+ h->param.analyse.i_me_method = X264_ME_ESA;
h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 7 );
h->param.analyse.b_bframe_rdo = h->param.analyse.b_bframe_rdo && h->param.analyse.i_subpel_refine >= 6;
h->param.analyse.b_mixed_references = h->param.analyse.b_mixed_references && h->param.i_frame_reference > 1;
static void mbcmp_init( x264_t *h )
{
- memcpy( h->pixf.mbcmp,
- ( h->mb.b_lossless || h->param.analyse.i_subpel_refine <= 1 ) ? h->pixf.sad : h->pixf.satd,
- sizeof(h->pixf.mbcmp) );
+ int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
+ memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp) );
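+ // fullpel compare functions switch to SATD only for me=tesa; everything else keeps SAD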
+ satd &= h->param.analyse.i_me_method == X264_ME_TESA;
+ memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
+ memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
+ memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
}
/****************************************************************************
#define COST_MV( mx, my )\
{\
- int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
+ int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE,\
&p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] )\
+ BITS_MVD(mx,my);\
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
- int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
}
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
uint8_t *pix_base = p_fref + bmx + bmy*m->i_stride[0];\
- h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
+ h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
pix_base + (m0x) + (m0y)*m->i_stride[0],\
pix_base + (m1x) + (m1y)*m->i_stride[0],\
pix_base + (m2x) + (m2y)*m->i_stride[0],\
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
uint8_t *pix_base = p_fref + omx + omy*m->i_stride[0];\
- h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+ h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0],\
pix_base + (m0x) + (m0y)*m->i_stride[0],\
pix_base + (m1x) + (m1y)*m->i_stride[0],\
pix_base + (m2x) + (m2y)*m->i_stride[0],\
#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
- h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
+ h->pixf.fpelcmp_x3[i_pixel]( m->p_fenc[0],\
p_fref + (m0x) + (m0y)*m->i_stride[0],\
p_fref + (m1x) + (m1y)*m->i_stride[0],\
p_fref + (m2x) + (m2y)*m->i_stride[0],\
}
case X264_ME_ESA:
+ case X264_ME_TESA:
{
const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
enc_dc[1] = enc_dc[2];
- for( my = min_y; my <= max_y; my++ )
+ if( h->mb.i_me_method == X264_ME_TESA )
+ {
+ // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
+ typedef struct {
+ int sad;
+ int16_t mx, my;
+ } mvsad_t;
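+ // worst case: every position in the search window passes both thresholds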
+ mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
+ int nmvsad = 0, limit;
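+ // record a candidate if its SAD is within sad_thresh/8 of the current best
+ // (1.25x for short ranges, up to 1.5x for long ones)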
+ int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
+ int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+ + BITS_MVD( bmx, bmy );
+ for( my = min_y; my <= max_y; my++ )
+ {
+ int ycost = p_cost_mvy[my<<2];
+ bsad -= ycost;
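+ // ADS prefilter with 1/16 slack over the best SAD, so near-misses still reach the exact SAD check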
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+ cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
+ for( i=0; i<xn-2; i+=3 )
+ {
+ uint8_t *ref = p_fref+min_x+my*stride;
+ int sads[3];
+ h->pixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
+ for( j=0; j<3; j++ )
+ {
+ int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
+ if( sad < bsad*sad_thresh>>3 )
+ {
+ COPY1_IF_LT( bsad, sad );
+ mvsads[nmvsad].sad = sad + ycost;
+ mvsads[nmvsad].mx = min_x+xs[i+j];
+ mvsads[nmvsad].my = my;
+ nmvsad++;
+ }
+ }
+ }
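+ // tail: fewer than 3 candidates left, so take the single-call SAD path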
+ for( ; i<xn; i++ )
+ {
+ int mx = min_x+xs[i];
+ int sad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride )
+ + cost_fpel_mvx[xs[i]];
+ if( sad < bsad*sad_thresh>>3 )
+ {
+ COPY1_IF_LT( bsad, sad );
+ mvsads[nmvsad].sad = sad + ycost;
+ mvsads[nmvsad].mx = mx;
+ mvsads[nmvsad].my = my;
+ nmvsad++;
+ }
+ }
+ bsad += ycost;
+ }
+
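+ // cap the number of candidates that go on to the SATD stage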
+ limit = i_me_range / 2;
+ if( nmvsad > limit*2 )
+ {
+ // too many candidates: halve the threshold's slack over the best SAD and refilter... eh, close enough
+ bsad = bsad*(sad_thresh+8)>>4;
+ for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+ for( j=i; j<nmvsad; j++ )
+ if( mvsads[j].sad <= bsad )
+ mvsads[i++] = mvsads[j];
+ nmvsad = i;
+ }
+ if( nmvsad > limit )
+ {
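+ // partial selection sort: move the 'limit' lowest-SAD candidates to the front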
+ for( i=0; i<limit; i++ )
+ {
+ int bj = i;
+ int bsad = mvsads[bj].sad;
+ for( j=i+1; j<nmvsad; j++ )
+ COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
+ if( bj > i )
+ XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+ }
+ nmvsad = limit;
+ }
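+ // rescore the survivors with COST_MV, whose fpelcmp resolves to SATD under TESA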
+ for( i=0; i<nmvsad; i++ )
+ COST_MV( mvsads[i].mx, mvsads[i].my );
+ x264_free( mvsads );
+ }
+ else
{
- bcost -= p_cost_mvy[my<<2];
- xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
- cost_fpel_mvx+min_x, xs, width, bcost );
- for( i=0; i<xn-2; i+=3 )
- COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
- bcost += p_cost_mvy[my<<2];
- for( ; i<xn; i++ )
- COST_MV( min_x+xs[i], my );
+ // plain ESA: just the ADS prefilter and exact SAD
+ for( my = min_y; my <= max_y; my++ )
+ {
+ bcost -= p_cost_mvy[my<<2];
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+ cost_fpel_mvx+min_x, xs, width, bcost );
+ for( i=0; i<xn-2; i+=3 )
+ COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
+ bcost += p_cost_mvy[my<<2];
+ for( ; i<xn; i++ )
+ COST_MV( min_x+xs[i], my );
+ }
}
if( xs != xs_buf )
{ \
int stride = 16; \
uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
- int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
}
src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh );
src1 = src0 + stride;
src3 = src2 + 1;
- h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
+ h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 );
COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 );
COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-2] + p_cost_mvy[omy ], bmx, omx-2, bmy, omy );