Fixes some extremely rare threading race conditions and makes the code cleaner.
Downside: slightly higher memory usage when calling multiple encoders from the same application.
#define X264_THREAD_MAX 128
#define X264_PCM_COST (386*8)
#define X264_LOOKAHEAD_MAX 250
+// arbitrary, but low because SATD scores are 1/4 normal
+#define X264_LOOKAHEAD_QP 12
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
+ /* mv/ref cost arrays. Indexed by lambda instead of
+ * qp because, due to rounding, some quantizers share
+ * lambdas. This saves memory. */
+ uint16_t *cost_mv[92];
+ uint16_t *cost_mv_fpel[92][4];
+
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
ALIGNED_16( uint32_t nr_residual_sum[2][64] );
#define x264_pthread_attr_t pthread_attr_t
#define x264_pthread_attr_init pthread_attr_init
#define x264_pthread_attr_destroy pthread_attr_destroy
+#define X264_PTHREAD_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
#else
#define x264_pthread_mutex_t int
#define x264_pthread_mutex_init(m,f) 0
#define x264_pthread_attr_t int
#define x264_pthread_attr_init(a) 0
#define x264_pthread_attr_destroy(a)
+#define X264_PTHREAD_MUTEX_INITIALIZER 0
#endif
#define WORD_SIZE sizeof(void*)
int i_lambda;
int i_lambda2;
int i_qp;
- int16_t *p_cost_mv;
+ uint16_t *p_cost_mv;
uint16_t *p_cost_ref0;
uint16_t *p_cost_ref1;
int i_mbrd;
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
-/* Indexed by lambda instead of qp because, due to rounding,
- * some quantizers share lambdas. This saves memory. */
-uint16_t *x264_cost_mv_fpel[92][4];
-uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[92][3][33];
+static x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-/* initialize an array of lambda*nbits for all possible mvs */
-static int x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+int x264_analyse_init_costs( x264_t *h, int qp )
{
- static int16_t *p_cost_mv[92];
int i, j;
-
- if( !p_cost_mv[a->i_lambda] )
- {
- x264_emms();
- /* could be faster, but isn't called many times */
- /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- CHECKED_MALLOC( p_cost_mv[a->i_lambda], (4*4*2048 + 1) * sizeof(int16_t) );
- p_cost_mv[a->i_lambda] += 2*4*2048;
- for( i = 0; i <= 2*4*2048; i++ )
- {
- p_cost_mv[a->i_lambda][-i] =
- p_cost_mv[a->i_lambda][i] = a->i_lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
- }
- for( i = 0; i < 3; i++ )
- for( j = 0; j < 33; j++ )
- x264_cost_ref[a->i_lambda][i][j] = i ? a->i_lambda * bs_size_te( i, j ) : 0;
- }
- a->p_cost_mv = p_cost_mv[a->i_lambda];
- a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
- a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
-
- /* FIXME is this useful for all me methods? */
- if( h->param.analyse.i_me_method >= X264_ME_ESA && !x264_cost_mv_fpel[a->i_lambda][0] )
+ int lambda = x264_lambda_tab[qp];
+ if( h->cost_mv[lambda] )
+ return 0;
+ /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
+ CHECKED_MALLOC( h->cost_mv[lambda], (4*4*2048 + 1) * sizeof(uint16_t) );
+ h->cost_mv[lambda] += 2*4*2048;
+ for( i = 0; i <= 2*4*2048; i++ )
+ {
+ h->cost_mv[lambda][-i] =
+ h->cost_mv[lambda][i] = lambda * (log2f(i+1)*2 + 0.718f + !!i) + .5f;
+ }
+ x264_pthread_mutex_lock( &cost_ref_mutex );
+ for( i = 0; i < 3; i++ )
+ for( j = 0; j < 33; j++ )
+ x264_cost_ref[lambda][i][j] = i ? lambda * bs_size_te( i, j ) : 0;
+ x264_pthread_mutex_unlock( &cost_ref_mutex );
+ if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[lambda][0] )
{
for( j=0; j<4; j++ )
{
- CHECKED_MALLOC( x264_cost_mv_fpel[a->i_lambda][j], (4*2048 + 1) * sizeof(int16_t) );
- x264_cost_mv_fpel[a->i_lambda][j] += 2*2048;
+ CHECKED_MALLOC( h->cost_mv_fpel[lambda][j], (4*2048 + 1) * sizeof(uint16_t) );
+ h->cost_mv_fpel[lambda][j] += 2*2048;
for( i = -2*2048; i < 2*2048; i++ )
- x264_cost_mv_fpel[a->i_lambda][j][i] = p_cost_mv[a->i_lambda][i*4+j];
+ h->cost_mv_fpel[lambda][j][i] = h->cost_mv[lambda][i*4+j];
}
}
return 0;
return -1;
}
+/* Release every mv cost table held in h->cost_mv[] and h->cost_mv_fpel[][].
+ * The stored pointers are biased towards the middle of their allocations
+ * (see the "+=" applied right after each CHECKED_MALLOC when the tables are
+ * built), so the same bias is subtracted before calling x264_free().
+ * Each fpel plane is tested on its own: if building the four planes of a
+ * lambda stopped part-way, the remaining entries are still NULL and must not
+ * be offset and freed. */
+void x264_analyse_free_costs( x264_t *h )
+{
+    int i, j;
+    for( i = 0; i < 92; i++ )
+    {
+        if( h->cost_mv[i] )
+            x264_free( h->cost_mv[i] - 2*4*2048 );
+        for( j = 0; j < 4; j++ )
+            if( h->cost_mv_fpel[i][j] )
+                x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
+    }
+}
+
+/* Point the analysis context at the cost tables for its current lambda.
+ * p_cost_mv comes from the per-encoder h->cost_mv[] (lambda*nbits for each
+ * possible mv); p_cost_ref0/p_cost_ref1 select the row of x264_cost_ref
+ * indexed by the number of active refs in list 0/1 minus one, passed through
+ * x264_clip3( ..., 0, 2 ).
+ * Nothing is computed or allocated here: the tables are built beforehand by
+ * x264_analyse_init_costs(), so this cannot fail. */
+static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
+{
+ a->p_cost_mv = h->cost_mv[a->i_lambda];
+ a->p_cost_ref0 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+ a->p_cost_ref1 = x264_cost_ref[a->i_lambda][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+}
+
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
{
int i = h->param.analyse.i_subpel_refine - (h->sh.i_type == SLICE_TYPE_B);
/*****************************************************************************
* x264_macroblock_analyse:
*****************************************************************************/
-int x264_macroblock_analyse( x264_t *h )
+void x264_macroblock_analyse( x264_t *h )
{
x264_mb_analysis_t analysis;
int i_cost = COST_MAX;
int i_thresh16x8;
int i_satd_inter, i_satd_intra;
- if( x264_mb_analyse_load_costs( h, &analysis ) )
- return -1;
+ x264_mb_analyse_load_costs( h, &analysis );
x264_mb_analyse_inter_p16x16( h, &analysis );
if( h->mb.i_type == P_SKIP )
- return 0;
+ return;
if( flags & X264_ANALYSE_PSUB16x16 )
{
int i_satd_inter;
h->mb.b_skip_mc = 0;
- if( x264_mb_analyse_load_costs( h, &analysis ) )
- return -1;
+ x264_mb_analyse_load_costs( h, &analysis );
/* select best inter mode */
/* direct must be first */
{
h->mb.i_type = B_SKIP;
x264_analyse_update_cache( h, &analysis );
- return 0;
+ return;
}
}
x264_psy_trellis_init( h, 0 );
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
- return 0;
}
/*-------------------- Update MB from the analysis ----------------------*/
#ifndef X264_ANALYSE_H
#define X264_ANALYSE_H
-int x264_macroblock_analyse( x264_t *h );
+int x264_analyse_init_costs( x264_t *h, int qp );
+void x264_analyse_free_costs( x264_t *h );
+void x264_macroblock_analyse( x264_t *h );
void x264_slicetype_decide( x264_t *h );
-int x264_lowres_context_alloc( x264_t *h );
void x264_slicetype_analyse( x264_t *h, int keyframe );
{
x264_t *h;
char buf[1000], *p;
- int i, i_slicetype_length;
+ int i, qp, i_slicetype_length;
CHECKED_MALLOCZERO( h, sizeof(x264_t) );
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
+ for( qp = h->param.rc.i_qp_min; qp <= h->param.rc.i_qp_max; qp++ )
+ if( x264_analyse_init_costs( h, qp ) )
+ goto fail;
+ if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
+ goto fail;
+
h->out.i_nal = 0;
h->out.i_bitstream = X264_MAX( 1000000, h->param.i_width * h->param.i_height * 4
* ( h->param.rc.i_rc_method == X264_RC_ABR ? pow( 0.95, h->param.rc.i_qp_min )
if( x264_ratecontrol_new( h ) < 0 )
goto fail;
- if( x264_lowres_context_alloc( h ) )
- goto fail;
-
if( h->param.psz_dump_yuv )
{
/* create or truncate the reconstructed video file */
/* load cache */
x264_macroblock_cache_load( h, i_mb_x, i_mb_y );
- /* analyse parameters
- * Slice I: choose I_4x4 or I_16x16 mode
- * Slice P: choose between using P mode or intra (4x4 or 16x16)
- * */
- if( x264_macroblock_analyse( h ) )
- return -1;
+ x264_macroblock_analyse( h );
/* encode this macroblock -> be careful it can change the mb type to P_SKIP if needed */
x264_macroblock_encode( h );
x264_cqm_delete( h );
+ x264_analyse_free_costs( h );
+
if( h->param.i_threads > 1)
h = h->thread[ h->i_thread_phase % h->param.i_threads ];
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
/* hexagon grid */
omx = bmx; omy = bmy;
- const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
- const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+ const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+ const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
i = 1;
do
{
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
- uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
- const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
- const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
- const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+ const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+ const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+ const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+ const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
- const int16_t *p_cost_mvx, *p_cost_mvy;
+ const uint16_t *p_cost_mvx, *p_cost_mvy;
const int bw = x264_pixel_size[m->i_pixel].w>>2;
const int bh = x264_pixel_size[m->i_pixel].h>>2;
const int i_pixel = m->i_pixel;
{
/* input */
int i_pixel; /* PIXEL_WxH */
- int16_t *p_cost_mv; /* lambda * nbits for each possible mv */
+ uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
}
}
+ q = x264_clip3f( q, h->param.rc.i_qp_min, h->param.rc.i_qp_max );
+
rc->qpa_rc =
rc->qpa_aq = 0;
h->fdec->f_qp_avg_rc =
#include "me.h"
-static int x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
+static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
- a->i_qp = 12; // arbitrary, but low because SATD scores are 1/4 normal
+ a->i_qp = X264_LOOKAHEAD_QP;
a->i_lambda = x264_lambda_tab[ a->i_qp ];
- if( x264_mb_analyse_load_costs( h, a ) )
- return -1;
+ x264_mb_analyse_load_costs( h, a );
h->mb.i_me_method = X264_MIN( X264_ME_HEX, h->param.analyse.i_me_method ); // maybe dia?
h->mb.i_subpel_refine = 4; // 3 should be enough, but not tweaking for speed now
h->mb.b_chroma_me = 0;
- return 0;
-}
-
-int x264_lowres_context_alloc( x264_t *h )
-{
- x264_mb_analysis_t a;
- return x264_lowres_context_init( h, &a );
}
static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
" where <option> is either\n"
" q=<integer> (force QP)\n"
" or b=<float> (bitrate multiplier)\n" );
- H1( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
+ H2( " --qpfile <string> Force frametypes and QPs for some or all frames\n"
" Format of each line: framenumber frametype QP\n"
- " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n" );
+ " QP of -1 lets x264 choose. Frametypes: I,i,P,B,b.\n"
+ " QPs are restricted by qpmin/qpmax.\n" );
H1( "\n" );
H1( "Analysis:\n" );
H1( "\n" );