there should now be no memory allocation outside of init-time.
} stat;
+ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
+
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
x264_predict_t predict_8x8c[4+3];
*****************************************************************************/
#include "common.h"
+#include "encoder/me.h"
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int16_t mvp[2] )
{
h->mb.i_neighbour4[15] =
h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT;
+ int buf_hpel = (h->param.i_width+40) * sizeof(int16_t);
+ int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
+ int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
+ int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
+ ((me_range*2+18) * sizeof(int16_t) + (me_range+1) * (me_range+1) * 4 * sizeof(mvsad_t));
+ CHECKED_MALLOC( h->scratch_buffer, X264_MAX3( buf_hpel, buf_ssim, buf_tesa ) );
+
return 0;
fail: return -1;
}
x264_free( h->mb.skipbp );
x264_free( h->mb.cbp );
x264_free( h->mb.qp );
+ x264_free( h->scratch_buffer );
}
void x264_macroblock_slice_init( x264_t *h )
{
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int stride, int width, int height )
+ int stride, int width, int height, int16_t *buf )
{
- int16_t *buf = x264_malloc((width+5)*sizeof(int16_t));
int x, y;
for( y=0; y<height; y++ )
{
dstc += stride;
src += stride;
}
- x264_free(buf);
}
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
frame->filtered[2] + offs,
frame->filtered[3] + offs,
frame->plane[0] + offs,
- stride, width + 16, height - start );
+ stride, width + 16, height - start,
+ h->scratch_buffer );
}
/* generate integral image:
uint8_t *src, int i_src, int w, int h);
void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int i_stride, int i_width, int i_height );
+ int i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
uint8_t *pix1, int stride1,
uint8_t *pix2, int stride2,
- int width, int height )
+ int width, int height, void *buf )
{
int x, y, z;
float ssim = 0.0;
- int (*sum0)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
- int (*sum1)[4] = x264_malloc(4 * (width/4+3) * sizeof(int));
+ int (*sum0)[4] = buf;
+ int (*sum1)[4] = sum0 + width/4+3;
width >>= 2;
height >>= 2;
z = 0;
for( x = 0; x < width-1; x += 4 )
ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
}
- x264_free(sum0);
- x264_free(sum1);
return ssim;
}
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
#endif
void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
void x264_sfence( void );\
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
- int stride, int width, int height )\
+ int stride, int width, int height, int16_t *buf )\
{\
- int16_t *buf;\
int realign = (long)src & (align-1);\
src -= realign;\
dstv -= realign;\
dstc -= realign;\
dsth -= realign;\
width += realign;\
- buf = x264_malloc((width+16)*sizeof(int16_t));\
while( height-- )\
{\
x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
src += stride;\
}\
x264_sfence();\
- x264_free(buf);\
}
HPEL(8, mmxext, mmxext, mmxext, mmxext)
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
#ifdef ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height );
+void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, sse2, ssse3, ssse3)
x264_pixel_ssim_wxh( &h->pixf,
h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
- h->param.i_width-2, max_y-min_y );
+ h->param.i_width-2, max_y-min_y, h->scratch_buffer );
}
}
DECLARE_ALIGNED_16( int enc_dc[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
- int16_t xs_buf[64];
- int16_t *xs = width<=64 ? xs_buf : x264_malloc( (width+15)*sizeof(int16_t) );
+ int16_t *xs = h->scratch_buffer;
int xn;
uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
- typedef struct {
- int sad;
- int16_t mx, my;
- } mvsad_t;
- mvsad_t *mvsads = x264_malloc( width*(max_y-min_y+1)*sizeof(mvsad_t) );
+ mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
}
for( i=0; i<nmvsad; i++ )
COST_MV( mvsads[i].mx, mvsads[i].my );
- x264_free( mvsads );
}
else
{
COST_MV( min_x+xs[i], my );
}
}
-
- if( xs != xs_buf )
- x264_free( xs );
#endif
}
break;
DECLARE_ALIGNED_4( int16_t mv[2] );
} DECLARE_ALIGNED_16( x264_me_t );
+typedef struct { /* one candidate motion vector kept by the X264_ME_TESA search path */
+ int sad; /* score stored for the candidate (presumably its SAD -- confirm against the code that fills the array) */
+ int16_t mx, my; /* candidate vector components; each surviving entry is later evaluated with COST_MV( mx, my ) */
+} mvsad_t; /* file-scope rather than local to the search so sizeof(mvsad_t) can be used when sizing h->scratch_buffer */
+
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
/* Convenience wrapper: forwards all arguments to x264_me_search_ref() and
 * passes NULL for its p_fullpel_thresh parameter. */
static inline void x264_me_search( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc )
{ x264_me_search_ref( h, m, mvc, i_mvc, NULL ); }
int sums[5][4] = {{0}};
used_asm = ok = 1;
x264_emms();
- res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 );
- res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
+ res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
+ res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
if( fabs(res_c - res_a) > 1e-6 )
{
ok = 0;
uint8_t *src = buf1+8+2*64;
uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
+ void *tmp = buf3+49*64;
set_func_name( "hpel_filter" );
ok = 1; used_asm = 1;
memset( buf3, 0, 4096 );
memset( buf4, 0, 4096 );
- call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
- call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
+ call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10, tmp );
+ call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10, tmp );
for( i=0; i<3; i++ )
for( j=0; j<10; j++ )
//FIXME ideally the first pixels would match too, but they aren't actually used