From: Loren Merritt Date: Fri, 3 Jun 2005 05:33:15 +0000 (+0000) Subject: 8x8 transform and 8x8 intra prediction. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1ab45c8f7411f7b4453ddff66919910e823ed33b;p=libx264 8x8 transform and 8x8 intra prediction. (backend only, not yet used by mb analysis) git-svn-id: svn://svn.videolan.org/x264/trunk@246 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/predict-a.asm b/common/amd64/predict-a.asm index 979baf26..96e7593a 100644 --- a/common/amd64/predict-a.asm +++ b/common/amd64/predict-a.asm @@ -48,17 +48,17 @@ BITS 64 SECTION .text -cglobal predict_8x8_v_mmx +cglobal predict_8x8c_v_mmx cglobal predict_16x16_v_mmx ;----------------------------------------------------------------------------- ; -; void predict_8x8_v_mmx( uint8_t *src, int i_stride ) +; void predict_8x8c_v_mmx( uint8_t *src, int i_stride ) ; ;----------------------------------------------------------------------------- ALIGN 16 -predict_8x8_v_mmx : +predict_8x8c_v_mmx : movsxd rcx, esi ; i_stride sub rdi , rcx ; esi <-- line -1 diff --git a/common/cabac.c b/common/cabac.c index 03f52373..7583c741 100644 --- a/common/cabac.c +++ b/common/cabac.c @@ -35,7 +35,7 @@ static int binCount = 0; #endif -static const int x264_cabac_context_init_I[399][2] = +static const int x264_cabac_context_init_I[460][2] = { /* 0 - 10 */ { 20, -15 }, { 2, 54 }, { 3, 74 }, { 20, -15 }, @@ -181,10 +181,30 @@ static const int x264_cabac_context_init_I[399][2] = { 31, -7 }, { 35, -15 }, { 34, -3 }, { 34, 3 }, { 36, -1 }, { 34, 5 }, { 32, 11 }, { 35, 5 }, { 34, 12 }, { 39, 11 }, { 30, 29 }, { 34, 26 }, - { 29, 39 }, { 19, 66 } + { 29, 39 }, { 19, 66 }, + + /* 399 -> 435 */ + { 31, 21 }, { 31, 31 }, { 25, 50 }, + { -17, 120 }, { -20, 112 }, { -18, 114 }, { -11, 85 }, + { -15, 92 }, { -14, 89 }, { -26, 71 }, { -15, 81 }, + { -14, 80 }, { 0, 68 }, { -14, 70 }, { -24, 56 }, + { -23, 68 }, { -24, 50 }, { -11, 74 }, { 23, -13 }, + { 26, -13 }, { 40, -15 }, { 
49, -14 }, { 44, 3 }, + { 45, 6 }, { 44, 34 }, { 33, 54 }, { 19, 82 }, + { -3, 75 }, { -1, 23 }, { 1, 34 }, { 1, 43 }, + { 0, 54 }, { -2, 55 }, { 0, 61 }, { 1, 64 }, + { 0, 68 }, { -9, 92 }, + + /* 436 -> 459 */ + { -14, 106 }, { -13, 97 }, { -15, 90 }, { -12, 90 }, + { -18, 88 }, { -10, 73 }, { -9, 79 }, { -14, 86 }, + { -10, 73 }, { -10, 70 }, { -10, 69 }, { -5, 66 }, + { -9, 64 }, { -5, 58 }, { 2, 59 }, { 21, -10 }, + { 24, -11 }, { 28, -8 }, { 28, -1 }, { 29, 3 }, + { 29, 9 }, { 35, 20 }, { 29, 36 }, { 14, 67 } }; -static const int x264_cabac_context_init_PB[3][399][2] = +static const int x264_cabac_context_init_PB[3][460][2] = { /* i_cabac_init_idc == 0 */ { @@ -321,7 +341,25 @@ static const int x264_cabac_context_init_PB[3][399][2] = { 23, 42 }, { 19, 57 }, { 22, 53 }, { 22, 61 }, { 11, 86 }, - + /* 399 -> 435 */ + { 12, 40 }, { 11, 51 }, { 14, 59 }, + { -4, 79 }, { -7, 71 }, { -5, 69 }, { -9, 70 }, + { -8, 66 }, { -10, 68 }, { -19, 73 }, { -12, 69 }, + { -16, 70 }, { -15, 67 }, { -20, 62 }, { -19, 70 }, + { -16, 66 }, { -22, 65 }, { -20, 63 }, { 9, -2 }, + { 26, -9 }, { 33, -9 }, { 39, -7 }, { 41, -2 }, + { 45, 3 }, { 49, 9 }, { 45, 27 }, { 36, 59 }, + { -6, 66 }, { -7, 35 }, { -7, 42 }, { -8, 45 }, + { -5, 48 }, { -12, 56 }, { -6, 60 }, { -5, 62 }, + { -8, 66 }, { -8, 76 }, + + /* 436 -> 459 */ + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8, 66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 21, -13 }, + { 33, -14 }, { 39, -7 }, { 46, -2 }, { 51, 2 }, + { 60, 6 }, { 61, 17 }, { 55, 34 }, { 42, 62 }, }, /* i_cabac_init_idc == 1 */ @@ -459,6 +497,25 @@ static const int x264_cabac_context_init_PB[3][399][2] = { 18, 50 }, { 12, 70 }, { 21, 54 }, { 14, 71 }, { 11, 83 }, + /* 399 -> 435 */ + { 24, 32 }, { 21, 49 }, { 21, 54 }, + { -5, 85 }, { -6, 81 }, { -10, 77 }, { -7, 81 }, + { -17, 80 }, { -18, 73 }, { -4, 74 }, { -10, 83 }, + { -9, 71 }, { -9, 67 }, { -1, 61 }, { -8,
66 }, + { -14, 66 }, { 0, 59 }, { 2, 59 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, + { -5, 71 }, { 0, 24 }, { -1, 36 }, { -2, 42 }, + { -2, 52 }, { -9, 57 }, { -6, 53 }, { -4, 65 }, + { -4, 67 }, { -7, 82 }, + + /* 436 -> 459 */ + { -3, 81 }, { -3, 76 }, { -7, 72 }, { -6, 78 }, + { -12, 72 }, { -14, 68 }, { -3, 70 }, { -6, 76 }, + { -5, 66 }, { -5, 62 }, { 0, 57 }, { -4, 61 }, + { -9, 60 }, { 1, 54 }, { 2, 58 }, { 17, -10 }, + { 32, -13 }, { 42, -9 }, { 49, -5 }, { 53, 0 }, + { 64, 3 }, { 68, 10 }, { 66, 27 }, { 47, 57 }, }, /* i_cabac_init_idc == 2 */ @@ -595,6 +652,26 @@ static const int x264_cabac_context_init_PB[3][399][2] = { 22, 42 }, { 16, 60 }, { 15, 52 }, { 14, 60 }, { 3, 78 }, { -16, 123 }, { 21, 53 }, { 22, 56 }, { 25, 61 }, + + /* 399 -> 435 */ + { 21, 33 }, { 19, 50 }, { 17, 61 }, + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, + { -9, 71 }, { -7, 37 }, { -8, 44 }, { -11, 49 }, + { -10, 56 }, { -12, 59 }, { -8, 63 }, { -9, 67 }, + { -6, 68 }, { -10, 79 }, + + /* 436 -> 459 */ + { -3, 78 }, { -8, 74 }, { -9, 72 }, { -10, 72 }, + { -18, 75 }, { -12, 71 }, { -11, 63 }, { -5, 70 }, + { -17, 75 }, { -14, 72 }, { -16, 67 }, { -8, 53 }, + { -14, 59 }, { -9, 52 }, { -11, 68 }, { 9, -2 }, + { 30, -10 }, { 31, -4 }, { 33, -1 }, { 33, 7 }, + { 31, 12 }, { 37, 23 }, { 31, 38 }, { 20, 64 }, } }; @@ -720,7 +797,7 @@ static const int x264_cabac_entropy[128] = *****************************************************************************/ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int i_model ) { - const int (*cabac_context_init)[399][2]; + const int (*cabac_context_init)[460][2]; int i; if( i_slice_type == 
SLICE_TYPE_I ) @@ -732,7 +809,7 @@ void x264_cabac_context_init( x264_cabac_t *cb, int i_slice_type, int i_qp, int cabac_context_init = &x264_cabac_context_init_PB[i_model]; } - for( i = 0; i < 399; i++ ) + for( i = 0; i < 436; i++ ) { int i_pre_state; @@ -865,7 +942,7 @@ void x264_cabac_model_update( x264_cabac_t *cb, int i_slice_type, int i_qp ) i_cost = 0; /* fix8 */ - for( i_ctx = 0; i_ctx < 399; i_ctx++ ) + for( i_ctx = 0; i_ctx < 436; i_ctx++ ) { int i_weight; int i_model_state; diff --git a/common/cabac.h b/common/cabac.h index 3051789d..945fb17a 100644 --- a/common/cabac.h +++ b/common/cabac.h @@ -34,12 +34,13 @@ typedef struct } slice[3]; /* context */ + /* states 436-459 are for interlacing, so are omitted for now */ struct { int i_state; int i_mps; int i_count; - } ctxstate[399]; + } ctxstate[436]; /* state */ int i_low; diff --git a/common/common.c b/common/common.c index eabb20b2..20b1c2bb 100644 --- a/common/common.c +++ b/common/common.c @@ -104,7 +104,7 @@ void x264_param_default( x264_param_t *param ) param->i_log_level = X264_LOG_INFO; /* */ - param->analyse.intra = X264_ANALYSE_I4x4; + param->analyse.intra = X264_ANALYSE_I4x4 | X264_ANALYSE_I8x8; param->analyse.inter = X264_ANALYSE_I4x4 | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_TEMPORAL; param->analyse.i_me_method = X264_ME_HEX; diff --git a/common/common.h b/common/common.h index 2ad1543b..0cd53ae1 100644 --- a/common/common.h +++ b/common/common.h @@ -289,6 +289,8 @@ struct x264_t { DECLARE_ALIGNED( int, luma16x16_dc[16], 16 ); DECLARE_ALIGNED( int, chroma_dc[2][4], 16 ); + // FIXME merge with union + DECLARE_ALIGNED( int, luma8x8[4][64], 16 ); union { DECLARE_ALIGNED( int, residual_ac[15], 16 ); @@ -326,6 +328,8 @@ struct x264_t /* neighboring MBs */ unsigned int i_neighbour; + unsigned int i_neighbour8[4]; /* neighbours of each 8x8 or 4x4 block that are available */ + unsigned int i_neighbour4[16]; /* at the time the block is coded 
*/ int i_mb_type_top; int i_mb_type_left; int i_mb_type_topleft; @@ -343,11 +347,13 @@ struct x264_t int8_t *ref[2]; /* mb ref. set to -1 if non used (intra or Lx only) */ int16_t (*mvr[2][16])[2]; /* 16x16 mv for each possible ref */ int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */ + int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */ /* current value */ int i_type; int i_partition; int i_sub_partition[4]; + int b_transform_8x8; int i_cbp_luma; int i_cbp_chroma; @@ -373,7 +379,7 @@ struct x264_t /* cache */ struct { - /* real intra4x4_pred_mode if I_4X4, I_PRED_4x4_DC if mb available, -1 if not */ + /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */ int intra4x4_pred_mode[X264_SCAN8_SIZE]; /* i_non_zero_count if availble else 0x80 */ @@ -391,6 +397,9 @@ struct x264_t int16_t direct_mv[2][X264_SCAN8_SIZE][2]; int8_t direct_ref[2][X264_SCAN8_SIZE]; + + /* top and left neighbors. 1=>8x8, 0=>4x4 */ + int8_t transform_size[2]; } cache; /* */ @@ -427,7 +436,7 @@ struct x264_t /* ? 
*/ int i_misc_bits; /* MB type counts */ - int i_mb_count[18]; + int i_mb_count[19]; int i_mb_count_p; int i_mb_count_skip; /* Estimated (SATD) cost as Intra/Predicted frame */ @@ -449,13 +458,14 @@ struct x264_t float f_psnr_mean_u[5]; float f_psnr_mean_v[5]; /* */ - int64_t i_mb_count[5][18]; + int64_t i_mb_count[5][19]; } stat; /* CPU functions dependants */ x264_predict_t predict_16x16[4+3]; - x264_predict_t predict_8x8[4+3]; + x264_predict_t predict_8x8c[4+3]; + x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; x264_pixel_function_t pixf; diff --git a/common/dct.c b/common/dct.c index 4cb59722..7c6b2b2a 100644 --- a/common/dct.c +++ b/common/dct.c @@ -256,6 +256,136 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ) add8x8_idct( &p_dst[8*i_dst+8], i_dst, &dct[12] ); } +/**************************************************************************** + * 8x8 transform: + ****************************************************************************/ + +static inline void dct8_1d( int16_t src[8][8], int16_t dst[8][8] ) +{ + int i; + for( i = 0; i < 8; i++ ) + { + const int s07 = src[i][0] + src[i][7]; + const int s16 = src[i][1] + src[i][6]; + const int s25 = src[i][2] + src[i][5]; + const int s34 = src[i][3] + src[i][4]; + + const int a0 = s07 + s34; + const int a1 = s16 + s25; + const int a2 = s07 - s34; + const int a3 = s16 - s25; + + const int d07 = src[i][0] - src[i][7]; + const int d16 = src[i][1] - src[i][6]; + const int d25 = src[i][2] - src[i][5]; + const int d34 = src[i][3] - src[i][4]; + + const int a4 = d16 + d25 + (d07 + (d07>>1)); + const int a5 = d07 - d34 - (d25 + (d25>>1)); + const int a6 = d07 + d34 - (d16 + (d16>>1)); + const int a7 = d16 - d25 + (d34 + (d34>>1)); + + dst[0][i] = a0 + a1; + dst[1][i] = a4 + (a7>>2); + dst[2][i] = a2 + (a3>>1); + dst[3][i] = a5 + (a6>>2); + dst[4][i] = a0 - a1; + dst[5][i] = a6 - (a5>>2); + dst[6][i] = (a2>>1) - a3; + dst[7][i] = (a4>>2) - a7; + } +} + +static void 
sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + int16_t d[8][8]; + int16_t tmp[8][8]; + int y, x; + + for( y = 0; y < 8; y++ ) + { + for( x = 0; x < 8; x++ ) + { + d[y][x] = pix1[x] - pix2[x]; + } + pix1 += i_pix1; + pix2 += i_pix2; + } + + dct8_1d( d, tmp ); + dct8_1d( tmp, dct ); +} + +static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) +{ + sub8x8_dct8( dct[0], pix1, i_pix1, pix2, i_pix2 ); + sub8x8_dct8( dct[1], &pix1[8], i_pix1, &pix2[8], i_pix2 ); + sub8x8_dct8( dct[2], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ); + sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ); +} + +static inline void idct8_1d( int16_t src[8][8], int16_t dst[8][8] ) +{ + int i; + for( i = 0; i < 8; i++ ) + { + const int a0 = src[i][0] + src[i][4]; + const int a2 = src[i][0] - src[i][4]; + const int a4 = (src[i][2]>>1) - src[i][6]; + const int a6 = (src[i][6]>>1) + src[i][2]; + + const int b0 = a0 + a6; + const int b2 = a2 + a4; + const int b4 = a2 - a4; + const int b6 = a0 - a6; + + const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1); + const int a3 = src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1); + const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1); + const int a7 = src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1); + + const int b1 = (a7>>2) + a1; + const int b3 = a3 + (a5>>2); + const int b5 = (a3>>2) - a5; + const int b7 = a7 - (a1>>2); + + dst[0][i] = b0 + b7; + dst[7][i] = b0 - b7; + dst[1][i] = b2 + b5; + dst[6][i] = b2 - b5; + dst[2][i] = b4 + b3; + dst[5][i] = b4 - b3; + dst[3][i] = b6 + b1; + dst[4][i] = b6 - b1; + } +} + +static void add8x8_idct8( uint8_t *p_dst, int i_dst, int16_t dct[8][8] ) +{ + int16_t d[8][8]; + int16_t tmp[8][8]; + int y, x; + + idct8_1d( dct, tmp ); + idct8_1d( tmp, d ); + + for( y = 0; y < 8; y++ ) + { + for( x = 0; x < 8; x++ ) + { + p_dst[x] = clip_uint8( p_dst[x] + ((d[y][x] + 32) >> 
6) ); + } + p_dst += i_dst; + } +} + +static void add16x16_idct8( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] ) +{ + add8x8_idct8( &p_dst[0], i_dst, dct[0] ); + add8x8_idct8( &p_dst[8], i_dst, dct[1] ); + add8x8_idct8( &p_dst[8*i_dst], i_dst, dct[2] ); + add8x8_idct8( &p_dst[8*i_dst+8], i_dst, dct[3] ); +} /**************************************************************************** @@ -269,8 +399,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub8x8_dct = sub8x8_dct; dctf->add8x8_idct = add8x8_idct; - dctf->sub16x16_dct = sub16x16_dct; - dctf->add16x16_idct = add16x16_idct; + dctf->sub16x16_dct = sub16x16_dct; + dctf->add16x16_idct = add16x16_idct; + + dctf->sub8x8_dct8 = sub8x8_dct8; + dctf->add8x8_idct8 = add8x8_idct8; + + dctf->sub16x16_dct8 = sub16x16_dct8; + dctf->add16x16_idct8 = add16x16_idct8; dctf->dct4x4dc = dct4x4dc; dctf->idct4x4dc = idct4x4dc; diff --git a/common/dct.h b/common/dct.h index bedbbf43..c0493137 100644 --- a/common/dct.h +++ b/common/dct.h @@ -35,6 +35,11 @@ typedef struct void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); void (*add16x16_idct) ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] ); + void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); + void (*add8x8_idct8) ( uint8_t *p_dst, int i_dst, int16_t dct[8][8] ); + + void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ); + void (*add16x16_idct8) ( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] ); void (*dct4x4dc) ( int16_t d[4][4] ); void (*idct4x4dc)( int16_t d[4][4] ); diff --git a/common/frame.c b/common/frame.c index 92c0ffdd..69a42b8a 100644 --- a/common/frame.c +++ b/common/frame.c @@ -644,6 +644,18 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) const int mb_4x4 = 4 * s4x4 * mb_y + 4 * mb_x; int i_edge; int i_dir; + const int b_8x8_transform = h->mb.mb_transform_size[mb_xy]; + + /* 
cavlc + 8x8 transform stores nnz per 16 coeffs for the purpose of + * entropy coding, but per 64 coeffs for the purpose of deblocking */ + if( !h->param.b_cabac && b_8x8_transform ) + { + uint32_t *nnz = (uint32_t*)h->mb.non_zero_count[mb_xy]; + if( nnz[0] ) nnz[0] = 0x01010101; + if( nnz[1] ) nnz[1] = 0x01010101; + if( nnz[2] ) nnz[2] = 0x01010101; + if( nnz[3] ) nnz[3] = 0x01010101; + } /* i_dir == 0 -> vertical edge * i_dir == 1 -> horizontal edge */ @@ -719,9 +731,12 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) if( i_dir == 0 ) { /* vertical edge */ - deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge], - h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1); - if( (i_edge % 2) == 0 ) + if( !b_8x8_transform || !(i_edge & 1) ) + { + deblocking_filter_edgev( h, &h->fdec->plane[0][16 * mb_y * h->fdec->i_stride[0]+ 16 * mb_x + 4 * i_edge], + h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1); + } + if( !(i_edge & 1) ) { /* U/V planes */ int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + @@ -735,10 +750,13 @@ void x264_frame_deblocking_filter( x264_t *h, int i_slice_type ) else { /* horizontal edge */ - deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x], - h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 ); + if( !b_8x8_transform || !(i_edge & 1) ) + { + deblocking_filter_edgeh( h, &h->fdec->plane[0][(16*mb_y + 4 * i_edge) * h->fdec->i_stride[0]+ 16 * mb_x], + h->fdec->i_stride[0], bS, (i_qp+i_qpn+1) >> 1 ); + } /* U/V planes */ - if( ( i_edge % 2 ) == 0 ) + if( !(i_edge & 1) ) { int i_qpc = ( i_chroma_qp_table[x264_clip3( i_qp + h->pps->i_chroma_qp_index_offset, 0, 51 )] + i_chroma_qp_table[x264_clip3( i_qpn + h->pps->i_chroma_qp_index_offset, 0, 51 )] + 1 ) >> 1; diff --git a/common/i386/predict-a.asm b/common/i386/predict-a.asm index 3237ebb6..3be28a1c 100644 --- a/common/i386/predict-a.asm +++ 
b/common/i386/predict-a.asm @@ -56,17 +56,17 @@ SECTION .data SECTION .text -cglobal predict_8x8_v_mmx +cglobal predict_8x8c_v_mmx cglobal predict_16x16_v_mmx ;----------------------------------------------------------------------------- ; -; void predict_8x8_v_mmx( uint8_t *src, int i_stride ) +; void predict_8x8c_v_mmx( uint8_t *src, int i_stride ) ; ;----------------------------------------------------------------------------- ALIGN 16 -predict_8x8_v_mmx : +predict_8x8c_v_mmx : ;push edi ;push esi diff --git a/common/i386/predict.c b/common/i386/predict.c index cf5c26e9..4b55596f 100644 --- a/common/i386/predict.c +++ b/common/i386/predict.c @@ -177,7 +177,7 @@ static void predict_16x16_v( uint8_t *src, int i_stride ) /**************************************************************************** * 8x8 prediction for intra chroma block DC, H, V, P ****************************************************************************/ -static void predict_8x8_dc_128( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_128( uint8_t *src, int i_stride ) { int y; @@ -191,7 +191,7 @@ static void predict_8x8_dc_128( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_dc_left( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_left( uint8_t *src, int i_stride ) { int y; uint32_t dc0 = 0, dc1 = 0; @@ -222,7 +222,7 @@ static void predict_8x8_dc_left( uint8_t *src, int i_stride ) } } -static void predict_8x8_dc_top( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_top( uint8_t *src, int i_stride ) { int y, x; uint32_t dc0 = 0, dc1 = 0; @@ -244,7 +244,7 @@ static void predict_8x8_dc_top( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_dc( uint8_t *src, int i_stride ) +static void predict_8x8c_dc( uint8_t *src, int i_stride ) { int y; int s0 = 0, s1 = 0, s2 = 0, s3 = 0; @@ -291,7 +291,7 @@ static void predict_8x8_dc( uint8_t *src, int i_stride ) } } -static void predict_8x8_h( uint8_t *src, int i_stride ) +static 
void predict_8x8c_h( uint8_t *src, int i_stride ) { int i; @@ -307,10 +307,10 @@ static void predict_8x8_h( uint8_t *src, int i_stride ) } } -extern void predict_8x8_v_mmx( uint8_t *src, int i_stride ); +extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride ); #if 0 -static void predict_8x8_v( uint8_t *src, int i_stride ) +static void predict_8x8c_v( uint8_t *src, int i_stride ) { int i; @@ -326,7 +326,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride ) /**************************************************************************** - * 4x4 prediction for intra luma block DC, H, V, P + * 4x4 prediction for intra luma block ****************************************************************************/ static void predict_4x4_dc_128( uint8_t *src, int i_stride ) { @@ -422,14 +422,14 @@ void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] ) pf[I_PRED_16x16_DC_128 ]= predict_16x16_dc_128; } -void x264_predict_8x8_init_mmxext( x264_predict_t pf[7] ) +void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] ) { - pf[I_PRED_CHROMA_V ] = predict_8x8_v_mmx; - pf[I_PRED_CHROMA_H ] = predict_8x8_h; - pf[I_PRED_CHROMA_DC] = predict_8x8_dc; - pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left; - pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top; - pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128; + pf[I_PRED_CHROMA_V ] = predict_8x8c_v_mmx; + pf[I_PRED_CHROMA_H ] = predict_8x8c_h; + pf[I_PRED_CHROMA_DC] = predict_8x8c_dc; + pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left; + pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top; + pf[I_PRED_CHROMA_DC_128 ]= predict_8x8c_dc_128; } void x264_predict_4x4_init_mmxext( x264_predict_t pf[12] ) diff --git a/common/i386/predict.h b/common/i386/predict.h index b00b1e59..9cec1ed2 100644 --- a/common/i386/predict.h +++ b/common/i386/predict.h @@ -25,7 +25,7 @@ #define _I386_PREDICT_H 1 void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] ); -void x264_predict_8x8_init_mmxext ( x264_predict_t pf[7] ); +void x264_predict_8x8c_init_mmxext 
( x264_predict_t pf[7] ); void x264_predict_4x4_init_mmxext ( x264_predict_t pf[12] ); #endif diff --git a/common/macroblock.c b/common/macroblock.c index 95772744..49804e58 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -54,23 +54,71 @@ static const int dequant_mf[6][4][4] = { {18, 23, 18, 23}, {23, 29, 23, 29}, {18, 23, 18, 23}, {23, 29, 23, 29} } }; -#if 0 -static const int i_chroma_qp_table[52] = +static const int dequant8_mf[6][8][8] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 29, 30, 31, 32, 32, 33, 34, 34, 35, 35, - 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, - 39, 39 + { + {20, 19, 25, 19, 20, 19, 25, 19}, + {19, 18, 24, 18, 19, 18, 24, 18}, + {25, 24, 32, 24, 25, 24, 32, 24}, + {19, 18, 24, 18, 19, 18, 24, 18}, + {20, 19, 25, 19, 20, 19, 25, 19}, + {19, 18, 24, 18, 19, 18, 24, 18}, + {25, 24, 32, 24, 25, 24, 32, 24}, + {19, 18, 24, 18, 19, 18, 24, 18} + }, { + {22, 21, 28, 21, 22, 21, 28, 21}, + {21, 19, 26, 19, 21, 19, 26, 19}, + {28, 26, 35, 26, 28, 26, 35, 26}, + {21, 19, 26, 19, 21, 19, 26, 19}, + {22, 21, 28, 21, 22, 21, 28, 21}, + {21, 19, 26, 19, 21, 19, 26, 19}, + {28, 26, 35, 26, 28, 26, 35, 26}, + {21, 19, 26, 19, 21, 19, 26, 19} + }, { + {26, 24, 33, 24, 26, 24, 33, 24}, + {24, 23, 31, 23, 24, 23, 31, 23}, + {33, 31, 42, 31, 33, 31, 42, 31}, + {24, 23, 31, 23, 24, 23, 31, 23}, + {26, 24, 33, 24, 26, 24, 33, 24}, + {24, 23, 31, 23, 24, 23, 31, 23}, + {33, 31, 42, 31, 33, 31, 42, 31}, + {24, 23, 31, 23, 24, 23, 31, 23} + }, { + {28, 26, 35, 26, 28, 26, 35, 26}, + {26, 25, 33, 25, 26, 25, 33, 25}, + {35, 33, 45, 33, 35, 33, 45, 33}, + {26, 25, 33, 25, 26, 25, 33, 25}, + {28, 26, 35, 26, 28, 26, 35, 26}, + {26, 25, 33, 25, 26, 25, 33, 25}, + {35, 33, 45, 33, 35, 33, 45, 33}, + {26, 25, 33, 25, 26, 25, 33, 25} + }, { + {32, 30, 40, 30, 32, 30, 40, 30}, + {30, 28, 38, 28, 30, 28, 38, 28}, + {40, 38, 51, 38, 40, 38, 51, 38}, + {30, 28, 38, 28, 30, 28, 38, 28}, + 
{32, 30, 40, 30, 32, 30, 40, 30}, + {30, 28, 38, 28, 30, 28, 38, 28}, + {40, 38, 51, 38, 40, 38, 51, 38}, + {30, 28, 38, 28, 30, 28, 38, 28} + }, { + {36, 34, 46, 34, 36, 34, 46, 34}, + {34, 32, 43, 32, 34, 32, 43, 32}, + {46, 43, 58, 43, 46, 43, 58, 43}, + {34, 32, 43, 32, 34, 32, 43, 32}, + {36, 34, 46, 34, 36, 34, 46, 34}, + {34, 32, 43, 32, 34, 32, 43, 32}, + {46, 43, 58, 43, 46, 43, 58, 43}, + {34, 32, 43, 32, 34, 32, 43, 32} + } }; -#endif int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ) { const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1]; const int mb = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 8]; - const int m = X264_MIN( ma, mb ); + const int m = X264_MIN( x264_mb_pred_mode4x4_fix(ma), + x264_mb_pred_mode4x4_fix(mb) ); if( m < 0 ) return I_PRED_4x4_DC; @@ -92,6 +140,24 @@ int x264_mb_predict_non_zero_code( x264_t *h, int idx ) return i_ret & 0x7f; } +int x264_mb_transform_8x8_allowed( x264_t *h, int i_mb_type ) +{ + int i; + if( i_mb_type == P_8x8 || i_mb_type == B_8x8 ) + { + for( i = 0; i < 4; i++ ) + if( !IS_SUB8x8(h->mb.i_sub_partition[i]) + || ( h->mb.i_sub_partition[i] == D_DIRECT_8x8 && !h->sps->b_direct8x8_inference ) ) + { + return 0; + } + } + if( i_mb_type == B_DIRECT && !h->sps->b_direct8x8_inference ) + return 0; + + return 1; +} + /**************************************************************************** * Scan and Quant functions ****************************************************************************/ @@ -166,6 +232,44 @@ void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale ) } } +void x264_mb_dequant_8x8( int16_t dct[8][8], int i_qscale ) +{ + const int i_mf = i_qscale%6; + int y; + + if( i_qscale >= 12 ) + { + const int i_qbits = (i_qscale/6) - 2; + for( y = 0; y < 8; y++ ) + { + dct[y][0] = ( dct[y][0] * dequant8_mf[i_mf][y][0] ) << i_qbits; + dct[y][1] = ( dct[y][1] * dequant8_mf[i_mf][y][1] ) << i_qbits; + dct[y][2] = ( dct[y][2] * dequant8_mf[i_mf][y][2] ) << i_qbits; + dct[y][3] = ( 
dct[y][3] * dequant8_mf[i_mf][y][3] ) << i_qbits; + dct[y][4] = ( dct[y][4] * dequant8_mf[i_mf][y][4] ) << i_qbits; + dct[y][5] = ( dct[y][5] * dequant8_mf[i_mf][y][5] ) << i_qbits; + dct[y][6] = ( dct[y][6] * dequant8_mf[i_mf][y][6] ) << i_qbits; + dct[y][7] = ( dct[y][7] * dequant8_mf[i_mf][y][7] ) << i_qbits; + } + } + else + { + const int i_qbits = 2 - (i_qscale/6); + const int i_round = i_qbits; // 1<<(i_qbits-1) + for( y = 0; y < 8; y++ ) + { + dct[y][0] = ( dct[y][0] * dequant8_mf[i_mf][y][0] + i_round ) >> i_qbits; + dct[y][1] = ( dct[y][1] * dequant8_mf[i_mf][y][1] + i_round ) >> i_qbits; + dct[y][2] = ( dct[y][2] * dequant8_mf[i_mf][y][2] + i_round ) >> i_qbits; + dct[y][3] = ( dct[y][3] * dequant8_mf[i_mf][y][3] + i_round ) >> i_qbits; + dct[y][4] = ( dct[y][4] * dequant8_mf[i_mf][y][4] + i_round ) >> i_qbits; + dct[y][5] = ( dct[y][5] * dequant8_mf[i_mf][y][5] + i_round ) >> i_qbits; + dct[y][6] = ( dct[y][6] * dequant8_mf[i_mf][y][6] + i_round ) >> i_qbits; + dct[y][7] = ( dct[y][7] * dequant8_mf[i_mf][y][7] + i_round ) >> i_qbits; + } + } +} + void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] ) { const int i8 = x264_scan8[idx]; @@ -829,9 +933,10 @@ void x264_macroblock_cache_init( x264_t *h ) h->mb.i_b8_stride = h->sps->i_mb_width * 2; h->mb.i_b4_stride = h->sps->i_mb_width * 4; - h->mb.qp = x264_malloc( i_mb_count * sizeof( int8_t) ); - h->mb.cbp = x264_malloc( i_mb_count * sizeof( int16_t) ); - h->mb.skipbp = x264_malloc( i_mb_count * sizeof( int8_t) ); + h->mb.qp = x264_malloc( i_mb_count * sizeof(int8_t) ); + h->mb.cbp = x264_malloc( i_mb_count * sizeof(int16_t) ); + h->mb.skipbp = x264_malloc( i_mb_count * sizeof(int8_t) ); + h->mb.mb_transform_size = x264_malloc( i_mb_count * sizeof(int8_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ h->mb.intra4x4_pred_mode = x264_malloc( i_mb_count * 7 * sizeof( int8_t ) ); @@ -874,6 +979,7 @@ void x264_macroblock_cache_end( x264_t *h ) } x264_free( h->mb.intra4x4_pred_mode ); 
x264_free( h->mb.non_zero_count ); + x264_free( h->mb.mb_transform_size ); x264_free( h->mb.skipbp ); x264_free( h->mb.cbp ); x264_free( h->mb.qp ); @@ -1070,6 +1176,14 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) else h->mb.i_mb_type_topleft = -1; + if( h->param.analyse.b_transform_8x8 ) + { + h->mb.cache.transform_size[0] = (h->mb.i_neighbour&MB_LEFT) + && h->mb.mb_transform_size[i_left_xy]; + h->mb.cache.transform_size[1] = (h->mb.i_neighbour&MB_TOP) + && h->mb.mb_transform_size[i_top_xy]; + } + /* load ref/mv/mvd */ if( h->sh.i_type != SLICE_TYPE_I ) { @@ -1237,12 +1351,37 @@ void x264_macroblock_cache_load( x264_t *h, int i_mb_x, int i_mb_y ) } } } + + // FIXME skip this if I_4x4 and I_8x8 are disabled? + // assumes MB_TOPRIGHT = MB_TOP<<1 + h->mb.i_neighbour4[0] = + h->mb.i_neighbour8[0] = (h->mb.i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT)) + | ((h->mb.i_neighbour & MB_TOP) ? MB_TOPRIGHT : 0); + h->mb.i_neighbour4[4] = + h->mb.i_neighbour4[1] = MB_LEFT | ((h->mb.i_neighbour & MB_TOP) ? (MB_TOP|MB_TOPLEFT|MB_TOPRIGHT) : 0); + h->mb.i_neighbour4[2] = + h->mb.i_neighbour4[8] = + h->mb.i_neighbour4[10] = + h->mb.i_neighbour8[2] = MB_TOP|MB_TOPRIGHT | ((h->mb.i_neighbour & MB_LEFT) ? (MB_LEFT|MB_TOPLEFT) : 0); + h->mb.i_neighbour4[3] = + h->mb.i_neighbour4[7] = + h->mb.i_neighbour4[11] = + h->mb.i_neighbour4[13] = + h->mb.i_neighbour4[15] = + h->mb.i_neighbour8[3] = MB_LEFT|MB_TOP|MB_TOPLEFT; + h->mb.i_neighbour4[5] = + h->mb.i_neighbour8[1] = MB_LEFT | (h->mb.i_neighbour & MB_TOPRIGHT) + | ((h->mb.i_neighbour & MB_TOP) ? 
MB_TOP|MB_TOPLEFT : 0); + h->mb.i_neighbour4[6] = + h->mb.i_neighbour4[9] = + h->mb.i_neighbour4[12] = + h->mb.i_neighbour4[14] = MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT; } void x264_macroblock_cache_save( x264_t *h ) { const int i_mb_xy = h->mb.i_mb_xy; - const int i_mb_type = h->mb.i_type; + const int i_mb_type = x264_mb_type_fix[h->mb.i_type]; const int s8x8 = h->mb.i_b8_stride; const int s4x4 = h->mb.i_b4_stride; const int i_mb_4x4 = h->mb.i_b4_xy; @@ -1295,6 +1434,8 @@ void x264_macroblock_cache_save( x264_t *h ) } } + h->mb.mb_transform_size[i_mb_xy] = h->mb.b_transform_8x8; + if( !IS_INTRA( i_mb_type ) ) { int i_list; diff --git a/common/macroblock.h b/common/macroblock.h index 7f26a2c0..a3b55271 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -36,35 +36,43 @@ enum macroblock_position_e /* XXX mb_type isn't the one written in the bitstream -> only internal usage */ -#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_16x16 ) +#define IS_INTRA(type) ( (type) == I_4x4 || (type) == I_8x8 || (type) == I_16x16 ) #define IS_SKIP(type) ( (type) == P_SKIP || (type) == B_SKIP ) #define IS_DIRECT(type) ( (type) == B_DIRECT ) enum mb_class_e { I_4x4 = 0, - I_16x16 = 1, - I_PCM = 2, - - P_L0 = 3, - P_8x8 = 4, - P_SKIP = 5, - - B_DIRECT = 6, - B_L0_L0 = 7, - B_L0_L1 = 8, - B_L0_BI = 9, - B_L1_L0 = 10, - B_L1_L1 = 11, - B_L1_BI = 12, - B_BI_L0 = 13, - B_BI_L1 = 14, - B_BI_BI = 15, - B_8x8 = 16, - B_SKIP = 17, + I_8x8 = 1, + I_16x16 = 2, + I_PCM = 3, + + P_L0 = 4, + P_8x8 = 5, + P_SKIP = 6, + + B_DIRECT = 7, + B_L0_L0 = 8, + B_L0_L1 = 9, + B_L0_BI = 10, + B_L1_L0 = 11, + B_L1_L1 = 12, + B_L1_BI = 13, + B_BI_L0 = 14, + B_BI_L1 = 15, + B_BI_BI = 16, + B_8x8 = 17, + B_SKIP = 18, +}; +static const int x264_mb_type_fix[19] = +{ + I_4x4, I_4x4, I_16x16, I_PCM, + P_L0, P_8x8, P_SKIP, + B_DIRECT, B_L0_L0, B_L0_L1, B_L0_BI, B_L1_L0, B_L1_L1, + B_L1_BI, B_BI_L0, B_BI_L1, B_BI_BI, B_8x8, B_SKIP }; -static const int x264_mb_type_list0_table[18][2] = +static const int 
x264_mb_type_list0_table[19][2] = { - {0,0}, {0,0}, {0,0}, /* INTRA */ + {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */ {1,1}, /* P_L0 */ {0,0}, /* P_8x8 */ {1,1}, /* P_SKIP */ @@ -75,9 +83,9 @@ static const int x264_mb_type_list0_table[18][2] = {0,0}, /* B_8x8 */ {0,0} /* B_SKIP */ }; -static const int x264_mb_type_list1_table[18][2] = +static const int x264_mb_type_list1_table[19][2] = { - {0,0}, {0,0}, {0,0}, /* INTRA */ + {0,0}, {0,0}, {0,0}, {0,0}, /* INTRA */ {0,0}, /* P_L0 */ {0,0}, /* P_8x8 */ {0,0}, /* P_SKIP */ @@ -160,6 +168,7 @@ void x264_macroblock_bipred_init( x264_t *h ); void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int i_qscale ); void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int i_qscale ); void x264_mb_dequant_4x4( int16_t dct[4][4], int i_qscale ); +void x264_mb_dequant_8x8( int16_t dct[8][8], int i_qscale ); /* x264_mb_predict_mv_16x16: * set mvp with predicted mv for D_16x16 block @@ -192,8 +201,10 @@ void x264_mb_predict_mv_ref16x16( x264_t *h, int i_list, int i_ref, int mvc[5][2 int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ); int x264_mb_predict_non_zero_code( x264_t *h, int idx ); +int x264_mb_transform_8x8_allowed( x264_t *h, int i_mb_type ); void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale ); +void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale ); void x264_mb_mc( x264_t *h ); @@ -244,6 +255,11 @@ static inline void x264_macroblock_cache_skip( x264_t *h, int x, int y, int widt } } } +static inline void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, int y, int i_mode ) +{ + int *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y]; + cache[0] = cache[1] = cache[8] = cache[9] = i_mode; +} #endif diff --git a/common/predict.c b/common/predict.c index 13c98603..5ce34e27 100644 --- a/common/predict.c +++ b/common/predict.c @@ -32,8 +32,8 @@ #include #include -#include "x264.h" -#include "predict.h" +#include "common.h" +#include "macroblock.h" #ifdef _MSC_VER #undef HAVE_MMXEXT /* not finished now */ @@ 
-197,7 +197,7 @@ static void predict_16x16_p( uint8_t *src, int i_stride ) /**************************************************************************** * 8x8 prediction for intra chroma block DC, H, V, P ****************************************************************************/ -static void predict_8x8_dc_128( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_128( uint8_t *src, int i_stride ) { int x,y; @@ -210,7 +210,7 @@ static void predict_8x8_dc_128( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_dc_left( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_left( uint8_t *src, int i_stride ) { int x,y; int dc0 = 0, dc1 = 0; @@ -233,7 +233,7 @@ static void predict_8x8_dc_left( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_dc_top( uint8_t *src, int i_stride ) +static void predict_8x8c_dc_top( uint8_t *src, int i_stride ) { int x,y; int dc0 = 0, dc1 = 0; @@ -256,7 +256,7 @@ static void predict_8x8_dc_top( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_dc( uint8_t *src, int i_stride ) +static void predict_8x8c_dc( uint8_t *src, int i_stride ) { int x,y; int s0 = 0, s1 = 0, s2 = 0, s3 = 0; @@ -297,7 +297,7 @@ static void predict_8x8_dc( uint8_t *src, int i_stride ) } } -static void predict_8x8_h( uint8_t *src, int i_stride ) +static void predict_8x8c_h( uint8_t *src, int i_stride ) { int i,j; @@ -314,7 +314,7 @@ static void predict_8x8_h( uint8_t *src, int i_stride ) src += i_stride; } } -static void predict_8x8_v( uint8_t *src, int i_stride ) +static void predict_8x8c_v( uint8_t *src, int i_stride ) { int i,j; @@ -327,7 +327,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride ) } } -static void predict_8x8_p( uint8_t *src, int i_stride ) +static void predict_8x8c_p( uint8_t *src, int i_stride ) { int i; int x,y; @@ -362,7 +362,7 @@ static void predict_8x8_p( uint8_t *src, int i_stride ) } 
/**************************************************************************** - * 4x4 prediction for intra luma block DC, H, V, P + * 4x4 prediction for intra luma block ****************************************************************************/ static void predict_4x4_dc_128( uint8_t *src, int i_stride ) { @@ -638,6 +638,245 @@ static void predict_4x4_hu( uint8_t *src, int i_stride ) src[3*i_stride+3]= l3; } +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ + +#define SRC(x,y) src[(x)+(y)*i_stride] +#define PL(y) \ + const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; +#define PREDICT_8x8_LOAD_LEFT \ + const int l0 = ((i_neighbor&MB_TOPLEFT ? SRC(-1,-1) : SRC(-1,0)) \ + + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ + PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ + const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; + +#define PT(x) \ + const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOP \ + const int t0 = ((i_neighbor&MB_TOPLEFT ? SRC(-1,-1) : SRC(0,-1)) \ + + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ + PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ + const int t7 = ((i_neighbor&MB_TOPRIGHT ? 
SRC(8,-1) : SRC(7,-1)) \ + + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \ + +#define PTR(x) \ + t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; +#define PREDICT_8x8_LOAD_TOPRIGHT \ + int t8, t9, t10, t11, t12, t13, t14, t15; \ + if(i_neighbor&MB_TOPRIGHT) { \ + PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \ + t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \ + } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); + +#define PREDICT_8x8_LOAD_TOPLEFT \ + const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2; + +#define PREDICT_8x8_DC(v) \ + int y; \ + for( y = 0; y < 8; y++ ) { \ + ((uint32_t*)src)[0] = \ + ((uint32_t*)src)[1] = v; \ + src += i_stride; \ + } + +static void predict_8x8_dc_128( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_DC(0x80808080); +} +static void predict_8x8_dc_left( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_LEFT; + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101; + PREDICT_8x8_DC(dc); +} +static void predict_8x8_dc_top( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101; + PREDICT_8x8_DC(dc); +} +static void predict_8x8_dc( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOP; + const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7 + +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101; + PREDICT_8x8_DC(dc); +} +static void predict_8x8_h( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_LEFT; +#define ROW(y) ((uint32_t*)(src+y*i_stride))[0] =\ + ((uint32_t*)(src+y*i_stride))[1] = 0x01010101U * l##y + ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7); +#undef ROW +} +static void predict_8x8_v( uint8_t *src, int i_stride, int i_neighbor ) +{ + int y; + PREDICT_8x8_LOAD_TOP; + src[0] = t0; + src[1] = t1; + src[2] = t2; + src[3] = t3; + src[4] = t4; + src[5] = t5; + src[6] = t6; + src[7] = t7; + for( y = 1; y < 8; y++ ) + 
*(uint64_t*)(src+y*i_stride) = *(uint64_t*)src; +} +static void predict_8x8_ddl( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2; + SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2; + SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2; + SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2; + SRC(7,7)= (t14 + 3*t15 + 2) >> 2; +} +static void predict_8x8_ddr( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2; + SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2; + 
SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2; + +} +static void predict_8x8_vr( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + /* produce warning as l7 is unused */ + SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2; + SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2; + SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2; + SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2; + SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2; + SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2; + SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1; + SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2; + SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1; + SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1; + SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1; + SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1; + SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1; + SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1; + SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2; + SRC(7,0)= (t6 + t7 + 1) >> 1; +} +static void predict_8x8_hd( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_LEFT; + PREDICT_8x8_LOAD_TOPLEFT; + 
/* produce warning as t7 is unused */ + SRC(0,7)= (l6 + l7 + 1) >> 1; + SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1; + SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1; + SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1; + SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1; + SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1; + SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1; + SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2; + SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1; + SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2; + SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2; + SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2; + SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2; + SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2; + SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2; + SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2; +} +static void predict_8x8_vl( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP; + PREDICT_8x8_LOAD_TOPRIGHT; + SRC(0,0)= (t0 + t1 + 1) >> 1; + SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2; + SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1; + SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2; + SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1; + SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2; + SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1; + SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2; + SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1; + SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2; + SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1; + SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2; + 
SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1; + SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2; + SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1; + SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2; + SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1; + SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2; + SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1; + SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2; + SRC(7,6)= (t10 + t11 + 1) >> 1; + SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2; +} +static void predict_8x8_hu( uint8_t *src, int i_stride, int i_neighbor ) +{ + PREDICT_8x8_LOAD_LEFT; + SRC(0,0)= (l0 + l1 + 1) >> 1; + SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2; + SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1; + SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2; + SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1; + SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2; + SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1; + SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2; + SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1; + SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2; + SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1; + SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2; + SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1; + SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2; + SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)= + SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)= + SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)= + SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7; +} + /**************************************************************************** * Exported functions: ****************************************************************************/ @@ -659,24 +898,40 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] ) #endif } -void x264_predict_8x8_init( int cpu, x264_predict_t pf[7] ) +void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) { - 
pf[I_PRED_CHROMA_V ] = predict_8x8_v; - pf[I_PRED_CHROMA_H ] = predict_8x8_h; - pf[I_PRED_CHROMA_DC] = predict_8x8_dc; - pf[I_PRED_CHROMA_P ] = predict_8x8_p; - pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8_dc_left; - pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8_dc_top; - pf[I_PRED_CHROMA_DC_128 ]= predict_8x8_dc_128; + pf[I_PRED_CHROMA_V ] = predict_8x8c_v; + pf[I_PRED_CHROMA_H ] = predict_8x8c_h; + pf[I_PRED_CHROMA_DC] = predict_8x8c_dc; + pf[I_PRED_CHROMA_P ] = predict_8x8c_p; + pf[I_PRED_CHROMA_DC_LEFT]= predict_8x8c_dc_left; + pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top; + pf[I_PRED_CHROMA_DC_128 ]= predict_8x8c_dc_128; #ifdef HAVE_MMXEXT if( cpu&X264_CPU_MMXEXT ) { - x264_predict_8x8_init_mmxext( pf ); + x264_predict_8x8c_init_mmxext( pf ); } #endif } +void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] ) +{ + pf[I_PRED_8x8_V] = predict_8x8_v; + pf[I_PRED_8x8_H] = predict_8x8_h; + pf[I_PRED_8x8_DC] = predict_8x8_dc; + pf[I_PRED_8x8_DDL] = predict_8x8_ddl; + pf[I_PRED_8x8_DDR] = predict_8x8_ddr; + pf[I_PRED_8x8_VR] = predict_8x8_vr; + pf[I_PRED_8x8_HD] = predict_8x8_hd; + pf[I_PRED_8x8_VL] = predict_8x8_vl; + pf[I_PRED_8x8_HU] = predict_8x8_hu; + pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left; + pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top; + pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128; +} + void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] ) { pf[I_PRED_4x4_V] = predict_4x4_v; diff --git a/common/predict.h b/common/predict.h index 988e57fb..3a9554d7 100644 --- a/common/predict.h +++ b/common/predict.h @@ -25,6 +25,7 @@ #define _PREDICT_H 1 typedef void (*x264_predict_t)( uint8_t *src, int i_stride ); +typedef void (*x264_predict8x8_t)( uint8_t *src, int i_stride, int i_neighbor ); enum intra_chroma_pred_e { @@ -37,7 +38,7 @@ enum intra_chroma_pred_e I_PRED_CHROMA_DC_TOP = 5, I_PRED_CHROMA_DC_128 = 6 }; -static const int x264_mb_pred_mode8x8_fix[7] = +static const int x264_mb_pred_mode8x8c_fix[7] = { I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, 
I_PRED_CHROMA_P, I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC @@ -76,17 +77,38 @@ enum intra4x4_pred_e I_PRED_4x4_DC_TOP = 10, I_PRED_4x4_DC_128 = 11, }; -static const int x264_mb_pred_mode4x4_fix[12] = +static const int x264_mb_pred_mode4x4_fix[13] = { + -1, I_PRED_4x4_V, I_PRED_4x4_H, I_PRED_4x4_DC, I_PRED_4x4_DDL, I_PRED_4x4_DDR, I_PRED_4x4_VR, I_PRED_4x4_HD, I_PRED_4x4_VL, I_PRED_4x4_HU, I_PRED_4x4_DC, I_PRED_4x4_DC, I_PRED_4x4_DC }; +#define x264_mb_pred_mode4x4_fix(t) x264_mb_pred_mode4x4_fix[(t)+1] + +/* must use the same numbering as intra4x4_pred_e */ +enum intra8x8_pred_e +{ + I_PRED_8x8_V = 0, + I_PRED_8x8_H = 1, + I_PRED_8x8_DC = 2, + I_PRED_8x8_DDL= 3, + I_PRED_8x8_DDR= 4, + I_PRED_8x8_VR = 5, + I_PRED_8x8_HD = 6, + I_PRED_8x8_VL = 7, + I_PRED_8x8_HU = 8, + + I_PRED_8x8_DC_LEFT = 9, + I_PRED_8x8_DC_TOP = 10, + I_PRED_8x8_DC_128 = 11, +}; void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] ); -void x264_predict_8x8_init ( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] ); +void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12] ); #endif diff --git a/common/set.h b/common/set.h index d559902f..3893bf64 100644 --- a/common/set.h +++ b/common/set.h @@ -28,7 +28,11 @@ enum profile_e { PROFILE_BASELINE = 66, PROFILE_MAIN = 77, - PROFILE_EXTENTED = 88 + PROFILE_EXTENTED = 88, + PROFILE_HIGH = 100, + PROFILE_HIGH10 = 110, + PROFILE_HIGH422 = 122, + PROFILE_HIGH444 = 144 }; typedef struct @@ -133,6 +137,9 @@ typedef struct int b_deblocking_filter_control; int b_constrained_intra_pred; int b_redundant_pic_cnt; + + int b_transform_8x8_mode; + } x264_pps_t; #endif diff --git a/common/visualize.c b/common/visualize.c index 78f2d91f..053c1b1b 100644 --- a/common/visualize.c +++ b/common/visualize.c @@ -147,6 +147,7 @@ void x264_visualize_show( x264_t *h ) static const stringlist_t mb_types[] = { /* Block types marked as NULL will not 
be drawn */ { I_4x4 , "red" }, + { I_8x8 , "#ff5640" }, { I_16x16 , "#ff8060" }, { I_PCM , "violet" }, { P_L0 , "SlateBlue" }, @@ -256,7 +257,7 @@ void x264_visualize_show( x264_t *h ) } } - if (v->i_type==I_4x4 || v->i_type==I_16x16 || v->i_type==I_PCM) { + if (IS_INTRA(v->i_type) || v->i_type==I_PCM) { /* Intra coded */ if (v->i_type==I_16x16) { switch (v->i_intra16x16_pred_mode) { @@ -278,42 +279,44 @@ void x264_visualize_show( x264_t *h ) break; } } - if (v->i_type==I_4x4) { - for (i=0; i<4; i++) for (j=0; j<4; j++) { + if (v->i_type==I_4x4 || v->i_type==I_8x8) { + const int di = v->i_type==I_8x8 ? 2 : 1; + const int zoom2 = zoom * di; + for (i=0; i<4; i+=di) for (j=0; j<4; j+=di) { const int x0 = x + j*4*zoom; const int y0 = y + i*4*zoom; - if (drawbox) disp_rect(0, x0, y0, x0+4*zoom, y0+4*zoom); + if (drawbox) disp_rect(0, x0, y0, x0+4*zoom2, y0+4*zoom2); switch (v->intra4x4_pred_mode[i][j]) { case I_PRED_4x4_V: /* Vertical */ - disp_line(0, x0+0*zoom, y0+1*zoom, x0+4*zoom, y0+1*zoom); + disp_line(0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2); break; case I_PRED_4x4_H: /* Horizontal */ - disp_line(0, x0+1*zoom, y0+0*zoom, x0+1*zoom, y0+4*zoom); + disp_line(0, x0+1*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2); break; case I_PRED_4x4_DC: /* DC, average from top and left sides */ case I_PRED_4x4_DC_LEFT: case I_PRED_4x4_DC_TOP: case I_PRED_4x4_DC_128: - disp_line(0, x0+1*zoom, y0+1*zoom, x0+4*zoom, y0+1*zoom); - disp_line(0, x0+1*zoom, y0+1*zoom, x0+1*zoom, y0+4*zoom); + disp_line(0, x0+1*zoom2, y0+1*zoom2, x0+4*zoom2, y0+1*zoom2); + disp_line(0, x0+1*zoom2, y0+1*zoom2, x0+1*zoom2, y0+4*zoom2); break; case I_PRED_4x4_DDL: /* Topright-bottomleft */ - disp_line(0, x0+0*zoom, y0+0*zoom, x0+4*zoom, y0+4*zoom); + disp_line(0, x0+0*zoom2, y0+0*zoom2, x0+4*zoom2, y0+4*zoom2); break; case I_PRED_4x4_DDR: /* Topleft-bottomright */ - disp_line(0, x0+0*zoom, y0+4*zoom, x0+4*zoom, y0+0*zoom); + disp_line(0, x0+0*zoom2, y0+4*zoom2, x0+4*zoom2, y0+0*zoom2); break; case 
I_PRED_4x4_VR: /* Mix of topleft-bottomright and vertical */ - disp_line(0, x0+0*zoom, y0+2*zoom, x0+4*zoom, y0+1*zoom); + disp_line(0, x0+0*zoom2, y0+2*zoom2, x0+4*zoom2, y0+1*zoom2); break; case I_PRED_4x4_HD: /* Mix of topleft-bottomright and horizontal */ - disp_line(0, x0+2*zoom, y0+0*zoom, x0+1*zoom, y0+4*zoom); + disp_line(0, x0+2*zoom2, y0+0*zoom2, x0+1*zoom2, y0+4*zoom2); break; case I_PRED_4x4_VL: /* Mix of topright-bottomleft and vertical */ - disp_line(0, x0+0*zoom, y0+1*zoom, x0+4*zoom, y0+2*zoom); + disp_line(0, x0+0*zoom2, y0+1*zoom2, x0+4*zoom2, y0+2*zoom2); break; case I_PRED_4x4_HU: /* Mix of topright-bottomleft and horizontal */ - disp_line(0, x0+1*zoom, y0+0*zoom, x0+2*zoom, y0+4*zoom); + disp_line(0, x0+1*zoom2, y0+0*zoom2, x0+2*zoom2, y0+4*zoom2); break; } } diff --git a/encoder/analyse.c b/encoder/analyse.c index d9a58d1d..99238651 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -77,16 +77,19 @@ typedef struct /* Take some shortcuts in intra search if intra is deemed unlikely */ int b_fast_intra; - /* Luma part 16x16 and 4x4 modes stats */ + /* Luma part */ int i_sad_i16x16; int i_predict16x16; + int i_sad_i8x8; + int i_predict8x8[2][2]; + int i_sad_i4x4; int i_predict4x4[4][4]; /* Chroma part */ - int i_sad_i8x8; - int i_predict8x8; + int i_sad_i8x8chroma; + int i_predict8x8chroma; /* II: Inter part P/B frame */ x264_mb_analysis_list_t l0; @@ -126,8 +129,8 @@ static const uint8_t block_idx_y[16] = { }; /* TODO: calculate CABAC costs */ -static const int i_mb_b_cost_table[18] = { - 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0 +static const int i_mb_b_cost_table[19] = { + 9, 9, 9, 9, 0, 0, 0, 1, 3, 7, 7, 7, 3, 7, 7, 7, 5, 9, 0 }; static const int i_mb_b16x8_cost_table[16] = { 0, 0, 0, 0, 0, 0, 0, 5, 7, 7, 7, 5, 7, 9, 9, 9 @@ -175,10 +178,13 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) && h->mb.i_subpel_refine >= 5; a->b_fast_intra = 0; + h->mb.b_transform_8x8 = 0; + /* I: Intra part */ 
a->i_sad_i16x16 = + a->i_sad_i8x8 = a->i_sad_i4x4 = - a->i_sad_i8x8 = COST_MAX; + a->i_sad_i8x8chroma = COST_MAX; /* II: Inter part P/B frame */ if( h->sh.i_type != SLICE_TYPE_I ) @@ -244,7 +250,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp ) || IS_INTRA( h->mb.i_mb_type_topleft ) || IS_INTRA( h->mb.i_mb_type_topright ) || (h->sh.i_type == SLICE_TYPE_P && IS_INTRA( h->fref0[0]->mb_type[h->mb.i_mb_xy] )) - || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_16x16])) ) + || (h->mb.i_mb_xy - h->sh.i_first_mb < 3*(h->stat.frame.i_mb_count[I_4x4] + h->stat.frame.i_mb_count[I_8x8] + h->stat.frame.i_mb_count[I_16x16])) ) { /* intra is likely */ } else { @@ -294,7 +300,7 @@ static void predict_16x16_mode_available( unsigned int i_neighbour, int *mode, i } /* Max = 4 */ -static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int *pi_count ) +static void predict_8x8chroma_mode_available( unsigned int i_neighbour, int *mode, int *pi_count ) { if( i_neighbour & MB_TOPLEFT ) { @@ -327,30 +333,18 @@ static void predict_8x8_mode_available( unsigned int i_neighbour, int *mode, int } } -/* MAX = 8 */ -static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int *mode, int *pi_count ) +/* MAX = 9 */ +static void predict_4x4_mode_available( unsigned int i_neighbour, + int *mode, int *pi_count ) { - int b_a, b_b, b_c; - static const unsigned int needmb[16] = - { - MB_LEFT|MB_TOP, MB_TOP, - MB_LEFT, MB_PRIVATE, - MB_TOP, MB_TOP|MB_TOPRIGHT, - 0, MB_PRIVATE, - MB_LEFT, 0, - MB_LEFT, MB_PRIVATE, - 0, MB_PRIVATE, - 0, MB_PRIVATE - }; - - /* FIXME even when b_c == 0 there is some case where missing pixels + /* FIXME even when b_tr == 0 there is some case where missing pixels * are emulated and thus more mode are available TODO * analysis and encode should be fixed too */ - b_a = (needmb[idx]&i_neighbour&MB_LEFT) == (needmb[idx]&MB_LEFT); - b_b = 
(needmb[idx]&i_neighbour&MB_TOP) == (needmb[idx]&MB_TOP); - b_c = (needmb[idx]&i_neighbour&(MB_TOPRIGHT|MB_PRIVATE)) == (needmb[idx]&(MB_TOPRIGHT|MB_PRIVATE)); + int b_l = i_neighbour & MB_LEFT; + int b_t = i_neighbour & MB_TOP; + int b_tr = i_neighbour & MB_TOPRIGHT; - if( b_a && b_b ) + if( b_l && b_t ) { *mode++ = I_PRED_4x4_DC; *mode++ = I_PRED_4x4_H; @@ -359,24 +353,16 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int * *mode++ = I_PRED_4x4_VR; *mode++ = I_PRED_4x4_HD; *mode++ = I_PRED_4x4_HU; - *pi_count = 7; - - if( b_c ) - { - *mode++ = I_PRED_4x4_DDL; - *mode++ = I_PRED_4x4_VL; - (*pi_count) += 2; - } } - else if( b_a && !b_b ) + else if( b_l ) { *mode++ = I_PRED_4x4_DC_LEFT; *mode++ = I_PRED_4x4_H; *mode++ = I_PRED_4x4_HU; *pi_count = 3; } - else if( !b_a && b_b ) + else if( b_t ) { *mode++ = I_PRED_4x4_DC_TOP; *mode++ = I_PRED_4x4_V; @@ -387,6 +373,13 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, int idx, int * *mode++ = I_PRED_4x4_DC_128; *pi_count = 1; } + + if( b_t && b_tr ) + { + *mode++ = I_PRED_4x4_DDL; + *mode++ = I_PRED_4x4_VL; + (*pi_count) += 2; + } } static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cost_inter ) @@ -455,7 +448,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos p_dst_by = p_dst + 4 * x + 4 * y * i_stride; i_best = COST_MAX; - predict_4x4_mode_available( h->mb.i_neighbour, idx, predict_mode, &i_max ); + predict_4x4_mode_available( h->mb.i_neighbour4[idx], predict_mode, &i_max ); for( i = 0; i < i_max; i++ ) { int i_sad; @@ -481,18 +474,69 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *res, int i_cos } res->i_sad_i4x4 += i_best; - /* we need to encode this mb now (for next ones) */ + /* we need to encode this block now (for next ones) */ h->predict_4x4[res->i_predict4x4[x][y]]( p_dst_by, i_stride ); x264_mb_encode_i4x4( h, idx, res->i_qp ); - /* we need to store the 'fixed' version */ - 
h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = - x264_mb_pred_mode4x4_fix[res->i_predict4x4[x][y]]; + h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = res->i_predict4x4[x][y]; } res->i_sad_i4x4 += res->i_lambda * 24; /* from JVT (SATD0) */ if( h->sh.i_type == SLICE_TYPE_B ) res->i_sad_i4x4 += res->i_lambda * i_mb_b_cost_table[I_4x4]; } + + /* 8x8 prediction selection: for each of the four 8x8 luma blocks, try every available mode, keep the one with the lowest SATD plus mode cost, then predict and encode the block with it (the following blocks need the result) */ + if( flags & X264_ANALYSE_I8x8 ) + { + res->i_sad_i8x8 = 0; + for( idx = 0; idx < 4; idx++ ) + { + uint8_t *p_src_by; + uint8_t *p_dst_by; + int i_best; + int x, y; + int i_pred_mode; + + i_pred_mode= x264_mb_predict_intra4x4_mode( h, 4*idx ); + x = idx&1; + y = idx>>1; + + p_src_by = p_src + 8 * x + 8 * y * i_stride; + p_dst_by = p_dst + 8 * x + 8 * y * i_stride; + + i_best = COST_MAX; + predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); + for( i = 0; i < i_max; i++ ) + { + int i_sad; + int i_mode; + + i_mode = predict_mode[i]; + h->predict_8x8[i_mode]( p_dst_by, i_stride, h->mb.i_neighbour8[idx] ); /* flags of this 8x8 block, same as used for mode availability */ + + i_sad = h->pixf.satd[PIXEL_8x8]( p_dst_by, i_stride, + p_src_by, i_stride ); + + i_sad += res->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 
1 : 4); /* macro form: the fix table is indexed with (mode)+1 */ + + if( i_best > i_sad ) + { + res->i_predict8x8[x][y] = i_mode; + i_best = i_sad; + } + } + res->i_sad_i8x8 += i_best; + + /* we need to encode this block now (for next ones) */ + h->predict_8x8[res->i_predict8x8[x][y]]( p_dst_by, i_stride, h->mb.i_neighbour8[idx] ); + x264_mb_encode_i8x8( h, idx, res->i_qp ); + + x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, res->i_predict8x8[x][y] ); /* cache the 8x8 mode chosen above, not the 4x4 result */ + } +// res->i_sad_i8x8 += res->i_lambda * something; // FIXME + if( h->sh.i_type == SLICE_TYPE_B ) + res->i_sad_i8x8 += res->i_lambda * i_mb_b_cost_table[I_8x8]; + } } static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res ) @@ -505,7 +549,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res ) uint8_t *p_dstc[2], *p_srcc[2]; int i_stride[2]; - if( res->i_sad_i8x8 < COST_MAX ) + if( res->i_sad_i8x8chroma < COST_MAX ) return; /* 8x8 prediction selection for chroma */ @@ -517,8 +561,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res ) i_stride[0] = h->mb.pic.i_stride[1]; i_stride[1] = h->mb.pic.i_stride[2]; - predict_8x8_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); - res->i_sad_i8x8 = COST_MAX; + predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); + res->i_sad_i8x8chroma = COST_MAX; for( i = 0; i < i_max; i++ ) { int i_sad; @@ -527,23 +571,25 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *res ) i_mode = predict_mode[i]; /* we do the prediction */ - h->predict_8x8[i_mode]( p_dstc[0], i_stride[0] ); - h->predict_8x8[i_mode]( p_dstc[1], i_stride[1] ); + h->predict_8x8c[i_mode]( p_dstc[0], i_stride[0] ); + h->predict_8x8c[i_mode]( p_dstc[1], i_stride[1] ); /* we calculate the cost */ i_sad = h->pixf.satd[PIXEL_8x8]( p_dstc[0], i_stride[0], p_srcc[0], i_stride[0] ) + h->pixf.satd[PIXEL_8x8]( p_dstc[1], i_stride[1], p_srcc[1], i_stride[1] ) + - res->i_lambda * bs_size_ue( x264_mb_pred_mode8x8_fix[i_mode] ); + res->i_lambda * bs_size_ue( 
x264_mb_pred_mode8x8c_fix[i_mode] ); /* if i_score is lower it is better */ - if( res->i_sad_i8x8 > i_sad ) + if( res->i_sad_i8x8chroma > i_sad ) { - res->i_predict8x8 = i_mode; - res->i_sad_i8x8 = i_sad; + res->i_predict8x8chroma = i_mode; + res->i_sad_i8x8chroma = i_sad; } } + + h->mb.i_chroma_pred_mode = res->i_predict8x8chroma; } #define LOAD_FENC( m, src, xoff, yoff) \ @@ -1316,12 +1362,18 @@ void x264_macroblock_analyse( x264_t *h ) /*--------------------------- Do the analysis ---------------------------*/ if( h->sh.i_type == SLICE_TYPE_I ) { + int i_cost; x264_mb_analyse_intra( h, &analysis, COST_MAX ); - if( analysis.i_sad_i4x4 < analysis.i_sad_i16x16 ) + i_cost = analysis.i_sad_i16x16; + h->mb.i_type = I_16x16; + if( analysis.i_sad_i4x4 < i_cost ) + { + i_cost = analysis.i_sad_i4x4; h->mb.i_type = I_4x4; - else - h->mb.i_type = I_16x16; + } + if( analysis.i_sad_i8x8 < i_cost ) + h->mb.i_type = I_8x8; } else if( h->sh.i_type == SLICE_TYPE_P ) { @@ -1493,8 +1545,8 @@ void x264_macroblock_analyse( x264_t *h ) || ( analysis.i_sad_i4x4 < i_cost ))) { x264_mb_analyse_intra_chroma( h, &analysis ); - analysis.i_sad_i16x16 += analysis.i_sad_i8x8; - analysis.i_sad_i4x4 += analysis.i_sad_i8x8; + analysis.i_sad_i16x16 += analysis.i_sad_i8x8chroma; + analysis.i_sad_i4x4 += analysis.i_sad_i8x8chroma; } i_intra_type = I_16x16; @@ -1697,7 +1749,7 @@ void x264_macroblock_analyse( x264_t *h ) } /*-------------------- Update MB from the analysis ----------------------*/ - h->mb.type[h->mb.i_mb_xy] = h->mb.i_type; + h->mb.type[h->mb.i_mb_xy] = x264_mb_type_fix[h->mb.i_type]; switch( h->mb.i_type ) { case I_4x4: @@ -1708,13 +1760,18 @@ void x264_macroblock_analyse( x264_t *h ) } x264_mb_analyse_intra_chroma( h, &analysis ); - h->mb.i_chroma_pred_mode = analysis.i_predict8x8; + break; + case I_8x8: + h->mb.b_transform_8x8 = 1; + for( i = 0; i < 4; i++ ) + x264_macroblock_cache_intra8x8_pred( h, 2*(i&1), 2*(i>>1), + analysis.i_predict8x8[i&1][i>>1] ); + + 
x264_mb_analyse_intra_chroma( h, &analysis ); break; case I_16x16: h->mb.i_intra16x16_pred_mode = analysis.i_predict16x16; - x264_mb_analyse_intra_chroma( h, &analysis ); - h->mb.i_chroma_pred_mode = analysis.i_predict8x8; break; case P_L0: diff --git a/encoder/cabac.c b/encoder/cabac.c index c2cae3dd..257191f0 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -47,7 +47,7 @@ static const uint8_t block_idx_xy[4][4] = static inline void x264_cabac_mb_type_intra( x264_t *h, int i_mb_type, int ctx0, int ctx1, int ctx2, int ctx3, int ctx4, int ctx5 ) { - if( i_mb_type == I_4x4 ) + if( i_mb_type == I_4x4 || i_mb_type == I_8x8 ) { x264_cabac_encode_decision( &h->cabac, ctx0, 0 ); } @@ -78,7 +78,7 @@ static inline void x264_cabac_mb_type_intra( x264_t *h, int i_mb_type, static void x264_cabac_mb_type( x264_t *h ) { - const int i_mb_type = h->mb.i_type; + const int i_mb_type = x264_mb_type_fix[h->mb.i_type]; if( h->sh.i_type == SLICE_TYPE_I ) { @@ -268,7 +268,8 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_t *h, int i_pred, int i_mode x264_cabac_encode_decision( &h->cabac, 69, (i_mode >> 2)&0x01 ); } } -static void x264_cabac_mb_intra8x8_pred_mode( x264_t *h ) + +static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h ) { const int i_mode = h->mb.i_chroma_pred_mode; int ctx = 0; @@ -554,6 +555,13 @@ static inline void x264_cabac_mb_sub_b_partition( x264_t *h, int i_sub ) } } +static inline void x264_cabac_mb_transform_size( x264_t *h ) +{ + int ctx = ( h->mb.cache.transform_size[0] == 1 ) + + ( h->mb.cache.transform_size[1] == 1 ); + x264_cabac_encode_decision( &h->cabac, 399 + ctx, h->mb.b_transform_8x8 ); +} + static inline void x264_cabac_mb_ref( x264_t *h, int i_list, int idx ) { const int i8 = x264_scan8[idx]; @@ -818,12 +826,24 @@ static int x264_cabac_mb_cbf_ctxidxinc( x264_t *h, int i_cat, int i_idx ) static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, int *l, int i_count ) { - static const int 
significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; - static const int last_significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 }; - static const int coeff_abs_level_m1_offset[5] = { 0, 10, 20, 30, 39 }; - - int i_coeff_abs_m1[16]; - int i_coeff_sign[16]; + static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 }; + static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 }; + static const int coeff_abs_level_m1_offset[6] = { 0, 10, 20, 30, 39, 199 }; + static const int significant_coeff_flag_offset_8x8[63] = { + 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5, + 4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7, + 7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11, + 12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 + }; + static const int last_significant_coeff_flag_offset_8x8[63] = { + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 + }; + + int i_coeff_abs_m1[64]; + int i_coeff_sign[64]; int i_coeff = 0; int i_last = 0; @@ -837,46 +857,50 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, * 2-> Luma4x4 i_idx = luma4x4idx * 3-> DC Chroma i_idx = iCbCr * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx + * 5-> Luma8x8 i_idx = luma8x8idx */ - //fprintf( stderr, "l[] = " ); for( i = 0; i < i_count; i++ ) { - //fprintf( stderr, "%d ", l[i] ); if( l[i] != 0 ) { i_coeff_abs_m1[i_coeff] = abs( l[i] ) - 1; - i_coeff_sign[i_coeff] = ( l[i] < 0 ? 
1 : 0); + i_coeff_sign[i_coeff] = ( l[i] < 0 ); i_coeff++; i_last = i; } } - //fprintf( stderr, "\n" ); - if( i_coeff == 0 ) + if( i_count != 64 ) { - /* codec block flag */ - x264_cabac_encode_decision( &h->cabac, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 0 ); - return; + /* coded block flag */ + x264_cabac_encode_decision( &h->cabac, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), i_coeff != 0 ); + if( i_coeff == 0 ) + return; } - /* block coded */ - x264_cabac_encode_decision( &h->cabac, 85 + x264_cabac_mb_cbf_ctxidxinc( h, i_ctxBlockCat, i_idx ), 1 ); for( i = 0; i < i_count - 1; i++ ) { - int i_ctxIdxInc; + int i_sig_ctxIdxInc; + int i_last_ctxIdxInc; - i_ctxIdxInc = X264_MIN( i, i_count - 2 ); + if( i_ctxBlockCat == 5 ) + { + i_sig_ctxIdxInc = significant_coeff_flag_offset_8x8[i]; + i_last_ctxIdxInc = last_significant_coeff_flag_offset_8x8[i]; + } + else + i_sig_ctxIdxInc = i_last_ctxIdxInc = i; if( l[i] != 0 ) { - x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 1 ); - x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, i == i_last ? 1 : 0 ); + x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 1 ); + x264_cabac_encode_decision( &h->cabac, 166 + last_significant_coeff_flag_offset[i_ctxBlockCat] + i_last_ctxIdxInc, i == i_last ? 
1 : 0 ); } else { - x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_ctxIdxInc, 0 ); + x264_cabac_encode_decision( &h->cabac, 105 + significant_coeff_flag_offset[i_ctxBlockCat] + i_sig_ctxIdxInc, 0 ); } if( i == i_last ) { @@ -905,13 +929,9 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, x264_cabac_encode_decision( &h->cabac, 227 + i_ctxIdxInc, 1 ); i_ctxIdxInc = 5 + X264_MIN( 4, i_abslevelgt1 ) + coeff_abs_level_m1_offset[i_ctxBlockCat]; for( j = 0; j < i_prefix - 1; j++ ) - { x264_cabac_encode_decision( &h->cabac, 227 + i_ctxIdxInc, 1 ); - } if( i_prefix < 14 ) - { x264_cabac_encode_decision( &h->cabac, 227 + i_ctxIdxInc, 0 ); - } } /* suffix */ if( i_coeff_abs_m1[i] >= 14 ) @@ -927,23 +947,16 @@ static void block_residual_write_cabac( x264_t *h, int i_ctxBlockCat, int i_idx, } x264_cabac_encode_bypass( &h->cabac, 0 ); while( k-- ) - { x264_cabac_encode_bypass( &h->cabac, (i_suffix >> k)&0x01 ); - } } /* write sign */ x264_cabac_encode_bypass( &h->cabac, i_coeff_sign[i] ); - if( i_coeff_abs_m1[i] == 0 ) - { i_abslevel1++; - } else - { i_abslevelgt1++; - } } } @@ -992,17 +1005,21 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) if( IS_INTRA( i_mb_type ) ) { - /* Prediction */ - if( i_mb_type == I_4x4 ) + if( h->pps->b_transform_8x8_mode && i_mb_type != I_16x16 ) + x264_cabac_mb_transform_size( h ); + + if( i_mb_type != I_16x16 ) { - for( i = 0; i < 16; i++ ) + int di = (i_mb_type == I_8x8) ? 4 : 1; + for( i = 0; i < 16; i += di ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, i ); const int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; x264_cabac_mb_intra4x4_pred_mode( h, i_pred, i_mode ); } } - x264_cabac_mb_intra8x8_pred_mode( h ); + + x264_cabac_mb_intra_chroma_pred_mode( h ); } else if( i_mb_type == P_L0 ) { @@ -1068,12 +1085,8 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) if( ( i_list ? 
h->sh.i_num_ref_idx_l1_active : h->sh.i_num_ref_idx_l0_active ) == 1 ) continue; for( i = 0; i < 4; i++ ) - { if( x264_mb_partition_listX_table[i_list][ h->mb.i_sub_partition[i] ] ) - { x264_cabac_mb_ref( h, i_list, 4*i ); - } - } } x264_cabac_mb8x8_mvd( h, 0 ); @@ -1141,6 +1154,12 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) x264_cabac_mb_cbp_chroma( h ); } + if( h->pps->b_transform_8x8_mode && h->mb.i_cbp_luma && !IS_INTRA(i_mb_type) + && x264_mb_transform_8x8_allowed( h, i_mb_type ) ) + { + x264_cabac_mb_transform_size( h ); + } + if( h->mb.i_cbp_luma > 0 || h->mb.i_cbp_chroma > 0 || i_mb_type == I_16x16 ) { x264_cabac_mb_qp_delta( h ); @@ -1151,24 +1170,22 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) /* DC Luma */ block_residual_write_cabac( h, 0, 0, h->dct.luma16x16_dc, 16 ); + /* AC Luma */ if( h->mb.i_cbp_luma != 0 ) - { - /* AC Luma */ for( i = 0; i < 16; i++ ) - { block_residual_write_cabac( h, 1, i, h->dct.block[i].residual_ac, 15 ); - } - } + } + else if( h->mb.b_transform_8x8 ) + { + for( i = 0; i < 4; i++ ) + if( h->mb.i_cbp_luma & ( 1 << i ) ) + block_residual_write_cabac( h, 5, i, h->dct.luma8x8[i], 64 ); } else { for( i = 0; i < 16; i++ ) - { if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) ) - { block_residual_write_cabac( h, 2, i, h->dct.block[i].luma4x4, 16 ); - } - } } if( h->mb.i_cbp_chroma &0x03 ) /* Chroma DC residual present */ @@ -1179,9 +1196,7 @@ void x264_macroblock_write_cabac( x264_t *h, bs_t *s ) if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ { for( i = 0; i < 8; i++ ) - { block_residual_write_cabac( h, 4, i, h->dct.block[16+i].residual_ac, 15 ); - } } } diff --git a/encoder/cavlc.c b/encoder/cavlc.c index a9715c27..11a542eb 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -397,12 +397,15 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) } return; } - else if( i_mb_type == I_4x4 ) + else if( i_mb_type == I_4x4 || i_mb_type == I_8x8 ) { + int di = i_mb_type == I_8x8 ? 
4 : 1; bs_write_ue( s, i_mb_i_offset + 0 ); + if( h->pps->b_transform_8x8_mode ) + bs_write1( s, h->mb.b_transform_8x8 ); /* Prediction: Luma */ - for( i = 0; i < 16; i++ ) + for( i = 0; i < 16; i += di ) { int i_pred = x264_mb_predict_intra4x4_mode( h, i ); int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; @@ -640,7 +643,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) h->stat.frame.i_hdr_bits += i_mb_pos_tex - i_mb_pos_start; /* Coded block patern */ - if( i_mb_type == I_4x4 ) + if( i_mb_type == I_4x4 || i_mb_type == I_8x8 ) { bs_write_ue( s, intra4x4_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] ); } @@ -649,6 +652,13 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) bs_write_ue( s, inter_cbp_to_golomb[( h->mb.i_cbp_chroma << 4 )|h->mb.i_cbp_luma] ); } + /* transform size 8x8 flag */ + if( h->pps->b_transform_8x8_mode && h->mb.i_cbp_luma && !IS_INTRA(i_mb_type) + && x264_mb_transform_8x8_allowed( h, i_mb_type ) ) + { + bs_write1( s, h->mb.b_transform_8x8 ); + } + /* write residual */ if( i_mb_type == I_16x16 ) { @@ -670,6 +680,19 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s ) { bs_write_se( s, h->mb.qp[h->mb.i_mb_xy] - h->mb.i_last_qp ); + /* shuffle 8x8 dct coeffs into 4x4 lists */ + if( h->mb.b_transform_8x8 ) + { + int i4; + for( i4 = 0; i4 < 16; i4++ ) + { + for( i = 0; i < 16; i++ ) + h->dct.block[i4].luma4x4[i] = h->dct.luma8x8[i4>>2][(i4&3)+i*4]; + h->mb.cache.non_zero_count[x264_scan8[i4]] = + array_non_zero_count( h->dct.block[i4].luma4x4, 16 ); + } + } + for( i = 0; i < 16; i++ ) { if( h->mb.i_cbp_luma & ( 1 << ( i / 4 ) ) ) diff --git a/encoder/encoder.c b/encoder/encoder.c index 854aa792..e5032c8c 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -50,7 +50,7 @@ #endif //#define DEBUG_MB_TYPE -//#define DEBUG_DUMP_FRAME +#define DEBUG_DUMP_FRAME //#define DEBUG_BENCHMARK #ifdef DEBUG_BENCHMARK @@ -408,6 +408,11 @@ static int x264_validate_parameters( x264_t *h ) 
h->param.analyse.i_subpel_refine = x264_clip3( h->param.analyse.i_subpel_refine, 1, 5 ); if( !(h->param.analyse.inter & X264_ANALYSE_PSUB16x16) ) h->param.analyse.inter &= ~X264_ANALYSE_PSUB8x8; + if( !h->param.analyse.b_transform_8x8 ) + { + h->param.analyse.inter &= ~X264_ANALYSE_I8x8; + h->param.analyse.intra &= ~X264_ANALYSE_I8x8; + } h->param.analyse.i_chroma_qp_offset = x264_clip3(h->param.analyse.i_chroma_qp_offset, -12, 12); h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 2048); @@ -426,7 +431,9 @@ static int x264_validate_parameters( x264_t *h ) x264_t *x264_encoder_open ( x264_param_t *param ) { x264_t *h = x264_malloc( sizeof( x264_t ) ); - int i, i_slice; + int i; + + memset( h, 0, sizeof( x264_t ) ); /* Create a copy of param */ memcpy( &h->param, param, sizeof( x264_param_t ) ); @@ -536,6 +543,7 @@ x264_t *x264_encoder_open ( x264_param_t *param ) /* init CPU functions */ x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); + x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8 ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); @@ -548,21 +556,6 @@ x264_t *x264_encoder_open ( x264_param_t *param ) if( x264_ratecontrol_new( h ) < 0 ) return NULL; - /* stat */ - for( i_slice = 0; i_slice < 5; i_slice++ ) - { - h->stat.i_slice_count[i_slice] = 0; - h->stat.i_slice_size[i_slice] = 0; - h->stat.i_slice_qp[i_slice] = 0; - - h->stat.i_sqe_global[i_slice] = 0; - h->stat.f_psnr_average[i_slice] = 0.0; - h->stat.f_psnr_mean_y[i_slice] = h->stat.f_psnr_mean_u[i_slice] = h->stat.f_psnr_mean_v[i_slice] = 0.0; - - for( i = 0; i < 18; i++ ) - h->stat.i_mb_count[i_slice][i] = 0; - } - x264_log( h, X264_LOG_INFO, "using cpu capabilities %s%s%s%s%s%s\n", param->cpu&X264_CPU_MMX ? "MMX " : "", param->cpu&X264_CPU_MMXEXT ? 
"MMXEXT " : "", @@ -889,6 +882,7 @@ static int x264_slice_write( x264_t *h ) int i_skip; int mb_xy; + /* init stats */ memset( &h->stat.frame, 0, sizeof(h->stat.frame) ); /* Slice */ @@ -1468,7 +1462,7 @@ do_encode: h->stat.i_slice_size[i_slice_type] += i_frame_size + NALU_OVERHEAD; h->stat.i_slice_qp[i_slice_type] += i_global_qp; - for( i = 0; i < 18; i++ ) + for( i = 0; i < 19; i++ ) { h->stat.i_mb_count[h->sh.i_type][i] += h->stat.frame.i_mb_count[i]; } @@ -1500,13 +1494,14 @@ do_encode: } x264_log( h, X264_LOG_DEBUG, - "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I4x4:%-4d I16x16:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n", + "frame=%4d QP=%i NAL=%d Slice:%c Poc:%-3d I4:%-4d I8:%-4d I16:%-4d P:%-4d SKIP:%-4d size=%d bytes%s\n", h->i_frame - 1, i_global_qp, i_nal_ref_idc, i_slice_type == SLICE_TYPE_I ? 'I' : (i_slice_type == SLICE_TYPE_P ? 'P' : 'B' ), frame_psnr->i_poc, h->stat.frame.i_mb_count[I_4x4], + h->stat.frame.i_mb_count[I_8x8], h->stat.frame.i_mb_count[I_16x16], h->stat.frame.i_mb_count_p, h->stat.frame.i_mb_count_skip, @@ -1516,12 +1511,12 @@ do_encode: #ifdef DEBUG_MB_TYPE { - static const char mb_chars[] = { 'i', 'I', 'C', 'P', '8', 'S', + static const char mb_chars[] = { 'i', 'i', 'I', 'C', 'P', '8', 'S', 'D', '<', 'X', 'B', 'X', '>', 'B', 'B', 'B', 'B', '8', 'S' }; int mb_xy; for( mb_xy = 0; mb_xy < h->sps->i_mb_width * h->sps->i_mb_height; mb_xy++ ) { - if( h->mb.type[mb_xy] < 18 && h->mb.type[mb_xy] >= 0 ) + if( h->mb.type[mb_xy] < 19 && h->mb.type[mb_xy] >= 0 ) fprintf( stderr, "%c ", mb_chars[ h->mb.type[mb_xy] ] ); else fprintf( stderr, "? 
" ); @@ -1609,8 +1604,9 @@ void x264_encoder_close ( x264_t *h ) const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I]; const double i_count = h->stat.i_slice_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; x264_log( h, X264_LOG_INFO, - "slice I Avg I4x4:%.1f%% I16x16:%.1f%%\n", + "slice I Avg I4x4:%.1f%% I8x8:%.1f%% I16x16:%.1f%%\n", i_mb_count[I_4x4] / i_count, + i_mb_count[I_8x8] / i_count, i_mb_count[I_16x16]/ i_count ); } if( h->stat.i_slice_count[SLICE_TYPE_P] > 0 ) @@ -1618,8 +1614,9 @@ void x264_encoder_close ( x264_t *h ) const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P]; const double i_count = h->stat.i_slice_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; x264_log( h, X264_LOG_INFO, - "slice P Avg I4x4:%.1f%% I16x16:%.1f%% P:%.1f%% P8x8:%.1f%% PSKIP:%.1f%%\n", + "slice P Avg I4x4:%.1f%% I8x8:%.1f%% I16x16:%.1f%% P:%.1f%% P8x8:%.1f%% PSKIP:%.1f%%\n", i_mb_count[I_4x4] / i_count, + i_mb_count[I_8x8] / i_count, i_mb_count[I_16x16]/ i_count, i_mb_count[P_L0] / i_count, i_mb_count[P_8x8] / i_count, @@ -1630,8 +1627,9 @@ void x264_encoder_close ( x264_t *h ) const int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B]; const double i_count = h->stat.i_slice_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; x264_log( h, X264_LOG_INFO, - "slice B Avg I4x4:%.1f%% I16x16:%.1f%% P:%.1f%% B:%.1f%% B8x8:%.1f%% DIRECT:%.1f%% BSKIP:%.1f%%\n", + "slice B Avg I4x4:%.1f%% I8x8:%.1f%% I16x16:%.1f%% P:%.1f%% B:%.1f%% B8x8:%.1f%% DIRECT:%.1f%% BSKIP:%.1f%%\n", i_mb_count[I_4x4] / i_count, + i_mb_count[I_8x8] / i_count, i_mb_count[I_16x16] / i_count, (i_mb_count[B_L0_L0] + i_mb_count[B_L1_L1] + i_mb_count[B_L1_L0] + i_mb_count[B_L0_L1]) / i_count, (i_mb_count[B_BI_BI] + i_mb_count[B_L0_BI] + i_mb_count[B_L1_BI] + i_mb_count[B_BI_L0] + i_mb_count[B_BI_L1]) / i_count, diff --git a/encoder/macroblock.c b/encoder/macroblock.c index ae9d1eea..d0d2d2cc 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -39,26 +39,85 @@ static const uint8_t 
block_idx_y[16] = }; static const uint8_t block_idx_xy[4][4] = { - { 0, 2, 8, 10}, - { 1, 3, 9, 11}, - { 4, 6, 12, 14}, - { 5, 7, 13, 15} + { 0, 2, 8, 10 }, + { 1, 3, 9, 11 }, + { 4, 6, 12, 14 }, + { 5, 7, 13, 15 } }; static const int quant_mf[6][4][4] = { - { { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243}, - { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243} }, - { { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660}, - { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660} }, - { { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194}, - { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194} }, - { { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647}, - { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647} }, - { { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355}, - { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355} }, - { { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893}, - { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893} } + { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 }, + { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } }, + { { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 }, + { 11916, 7490, 11916, 7490 }, { 7490, 4660, 7490, 4660 } }, + { { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 }, + { 10082, 6554, 10082, 6554 }, { 6554, 4194, 6554, 4194 } }, + { { 9362, 5825, 9362, 5825 }, { 5825, 3647, 5825, 3647 }, + { 9362, 5825, 9362, 5825 }, { 5825, 3647, 5825, 3647 } }, + { { 8192, 5243, 8192, 5243 }, { 5243, 3355, 5243, 3355 }, + { 8192, 5243, 8192, 5243 }, { 5243, 3355, 5243, 3355 } }, + { { 7282, 4559, 7282, 4559 }, { 4559, 2893, 4559, 2893 }, + { 7282, 4559, 7282, 4559 }, { 4559, 2893, 4559, 2893 } } +}; + +const int quant8_mf[6][8][8] = +{ + { + { 13107, 12222, 16777, 12222, 13107, 12222, 16777, 12222 }, + { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 }, + { 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481 }, + { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 }, + { 13107, 
12222, 16777, 12222, 13107, 12222, 16777, 12222 }, + { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 }, + { 16777, 15481, 20972, 15481, 16777, 15481, 20972, 15481 }, + { 12222, 11428, 15481, 11428, 12222, 11428, 15481, 11428 } + }, { + { 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058 }, + { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 }, + { 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290 }, + { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 }, + { 11916, 11058, 14980, 11058, 11916, 11058, 14980, 11058 }, + { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 }, + { 14980, 14290, 19174, 14290, 14980, 14290, 19174, 14290 }, + { 11058, 10826, 14290, 10826, 11058, 10826, 14290, 10826 } + }, { + { 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675 }, + { 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943 }, + { 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985 }, + { 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943 }, + { 10082, 9675, 12710, 9675, 10082, 9675, 12710, 9675 }, + { 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943 }, + { 12710, 11985, 15978, 11985, 12710, 11985, 15978, 11985 }, + { 9675, 8943, 11985, 8943, 9675, 8943, 11985, 8943 } + }, { + { 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931 }, + { 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228 }, + { 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259 }, + { 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228 }, + { 9362, 8931, 11984, 8931, 9362, 8931, 11984, 8931 }, + { 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228 }, + { 11984, 11259, 14913, 11259, 11984, 11259, 14913, 11259 }, + { 8931, 8228, 11259, 8228, 8931, 8228, 11259, 8228 } + }, { + { 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740 }, + { 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346 }, + { 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777 }, + { 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346 }, + { 8192, 7740, 10486, 7740, 8192, 7740, 10486, 7740 }, + { 7740, 7346, 9777, 7346, 7740, 7346, 
9777, 7346 }, + { 10486, 9777, 13159, 9777, 10486, 9777, 13159, 9777 }, + { 7740, 7346, 9777, 7346, 7740, 7346, 9777, 7346 } + }, { + { 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830 }, + { 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428 }, + { 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640 }, + { 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428 }, + { 7282, 6830, 9118, 6830, 7282, 6830, 9118, 6830 }, + { 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428 }, + { 9118, 8640, 11570, 8640, 9118, 8640, 11570, 8640 }, + { 6830, 6428, 8640, 6428, 6830, 6428, 8640, 6428 } + } }; static const int i_chroma_qp_table[52] = @@ -77,67 +136,67 @@ static const int i_chroma_qp_table[52] = //static const int scan_zigzag_x[16]={0, 1, 0, 0, 1, 2, 3, 2, 1, 0, 1, 2, 3, 3, 2, 3}; //static const int scan_zigzag_y[16]={0, 0, 1, 2, 1, 0, 0, 1, 2, 3, 3, 2, 1, 2, 3, 3}; +#define ZIG(i,y,x) level[i] = dct[y][x]; +static inline void scan_zigzag_8x8full( int level[64], int16_t dct[8][8] ) +{ + ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0) + ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) + ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1) + ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5) + ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1) + ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2) + ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6) + ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4) + ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0) + ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4) + ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7) + ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3) + ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5) + ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6) + ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6) + ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7) +} static inline void scan_zigzag_4x4full( int level[16], int16_t dct[4][4] ) { - level[0] = dct[0][0]; - level[1] = dct[0][1]; - level[2] = dct[1][0]; - level[3] = dct[2][0]; - level[4] = dct[1][1]; - level[5] = dct[0][2]; - 
level[6] = dct[0][3]; - level[7] = dct[1][2]; - level[8] = dct[2][1]; - level[9] = dct[3][0]; - level[10] = dct[3][1]; - level[11] = dct[2][2]; - level[12] = dct[1][3]; - level[13] = dct[2][3]; - level[14] = dct[3][2]; - level[15] = dct[3][3]; -#if 0 - int i; - for( i = 0; i < 16; i++ ) - { - level[i] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]]; - } -#endif + ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0) + ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2) + ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2) + ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) } static inline void scan_zigzag_4x4( int level[15], int16_t dct[4][4] ) { - level[0] = dct[0][1]; - level[1] = dct[1][0]; - level[2] = dct[2][0]; - level[3] = dct[1][1]; - level[4] = dct[0][2]; - level[5] = dct[0][3]; - level[6] = dct[1][2]; - level[7] = dct[2][1]; - level[8] = dct[3][0]; - level[9] = dct[3][1]; - level[10] = dct[2][2]; - level[11] = dct[1][3]; - level[12] = dct[2][3]; - level[13] = dct[3][2]; - level[14] = dct[3][3]; -#if 0 - int i; - for( i = 1; i < 16; i++ ) - { - level[i - 1] = dct[scan_zigzag_y[i]][scan_zigzag_x[i]]; - } -#endif + ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0) + ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2) + ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2) + ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3) } - static inline void scan_zigzag_2x2_dc( int level[4], int16_t dct[2][2] ) { - level[0] = dct[0][0]; - level[1] = dct[0][1]; - level[2] = dct[1][0]; - level[3] = dct[1][1]; + ZIG(0,0,0) + ZIG(1,0,1) + ZIG(2,1,0) + ZIG(3,1,1) } +#undef ZIG +static void quant_8x8( int16_t dct[8][8], int i_qscale, int b_intra ) +{ + const int i_qbits = 16 + i_qscale / 6; + const int i_mf = i_qscale % 6; + const int f = ( 1 << i_qbits ) / ( b_intra ? 
3 : 6 ); + int x,y; + for( y = 0; y < 8; y++ ) + { + for( x = 0; x < 8; x++ ) + { + if( dct[y][x] > 0 ) + dct[y][x] = ( f + dct[y][x] * quant8_mf[i_mf][y][x] ) >> i_qbits; + else + dct[y][x] = - ( ( f - dct[y][x] * quant8_mf[i_mf][y][x] ) >> i_qbits ); + } + } +} static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra ) { const int i_qbits = 15 + i_qscale / 6; @@ -150,13 +209,9 @@ static void quant_4x4( int16_t dct[4][4], int i_qscale, int b_intra ) for( x = 0; x < 4; x++ ) { if( dct[y][x] > 0 ) - { - dct[y][x] =( f + dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits; - } + dct[y][x] = ( f + dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits; else - { - dct[y][x] = - ( ( f - dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits ); - } + dct[y][x] = - ( ( f - dct[y][x] * quant_mf[i_mf][y][x] ) >> i_qbits ); } } } @@ -172,13 +227,9 @@ static void quant_4x4_dc( int16_t dct[4][4], int i_qscale ) for( x = 0; x < 4; x++ ) { if( dct[y][x] > 0 ) - { dct[y][x] =( f2 + dct[y][x] * i_qmf) >> ( 1 + i_qbits ); - } else - { dct[y][x] = - ( ( f2 - dct[y][x] * i_qmf ) >> (1 + i_qbits ) ); - } } } } @@ -194,13 +245,9 @@ static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra ) for( x = 0; x < 2; x++ ) { if( dct[y][x] > 0 ) - { dct[y][x] =( f2 + dct[y][x] * i_qmf) >> ( 1 + i_qbits ); - } else - { dct[y][x] = - ( ( f2 - dct[y][x] * i_qmf ) >> (1 + i_qbits ) ); - } } } } @@ -306,21 +353,6 @@ static void quant_2x2_dc( int16_t dct[2][2], int i_qscale, int b_intra ) #endif -static inline int array_non_zero_count( int *v, int i_count ) -{ - int i; - int i_nz; - - for( i = 0, i_nz = 0; i < i_count; i++ ) - { - if( v[i] ) - { - i_nz++; - } - } - return i_nz; -} - /* (ref: JVT-B118) * x264_mb_decimate_score: given dct coeffs it returns a score to see if we could empty this dct coeffs * to 0 (low score means set it to null) @@ -331,24 +363,27 @@ static inline int array_non_zero_count( int *v, int i_count ) */ static int x264_mb_decimate_score( int *dct, int i_max ) { - static const 
int i_ds_table[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - + static const int i_ds_table4[16] = { + 3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 }; + static const int i_ds_table8[64] = { + 3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1, + 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + + const int *ds_table = (i_max == 64) ? i_ds_table8 : i_ds_table4; int i_score = 0; int idx = i_max - 1; while( idx >= 0 && dct[idx] == 0 ) - { idx--; - } while( idx >= 0 ) { int i_run; if( abs( dct[idx--] ) > 1 ) - { return 9; - } i_run = 0; while( idx >= 0 && dct[idx] == 0 ) @@ -356,7 +391,7 @@ static int x264_mb_decimate_score( int *dct, int i_max ) idx--; i_run++; } - i_score += i_ds_table[i_run]; + i_score += ds_table[i_run]; } return i_score; @@ -365,23 +400,35 @@ static int x264_mb_decimate_score( int *dct, int i_max ) void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qscale ) { const int i_stride = h->mb.pic.i_stride[0]; - uint8_t *p_src = &h->mb.pic.p_fenc[0][4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride]; - uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride]; - + const int i_offset = 4 * block_idx_x[idx] + 4 * block_idx_y[idx] * i_stride; + uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset]; + uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset]; int16_t dct4x4[4][4]; h->dctf.sub4x4_dct( dct4x4, p_src, i_stride, p_dst, i_stride ); - quant_4x4( dct4x4, i_qscale, 1 ); - scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4 ); - x264_mb_dequant_4x4( dct4x4, i_qscale ); /* output samples to fdec */ h->dctf.add4x4_idct( p_dst, i_stride, dct4x4 ); } +void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qscale ) +{ + const int i_stride = h->mb.pic.i_stride[0]; + const int i_offset = 8 * (idx&1) + 8 * (idx>>1) * i_stride; + uint8_t *p_src = &h->mb.pic.p_fenc[0][i_offset]; + uint8_t *p_dst = &h->mb.pic.p_fdec[0][i_offset]; + int16_t dct8x8[8][8]; + + h->dctf.sub8x8_dct8( dct8x8, p_src, 
i_stride, p_dst, i_stride ); + quant_8x8( dct8x8, i_qscale, 1 ); + scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8 ); + x264_mb_dequant_8x8( dct8x8, i_qscale ); + h->dctf.add8x8_idct8( p_dst, i_stride, dct8x8 ); +} + static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) { const int i_stride = h->mb.pic.i_stride[0]; @@ -422,7 +469,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qscale ) h->dctf.add16x16_idct( p_dst, i_stride, &dct4x4[1] ); } -static void x264_mb_encode_8x8( x264_t *h, int b_inter, int i_qscale ) +static void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qscale ) { int i, ch; @@ -572,6 +619,19 @@ void x264_macroblock_encode( x264_t *h ) /* fix the pred mode value */ h->mb.i_intra16x16_pred_mode = x264_mb_pred_mode16x16_fix[i_mode]; } + else if( h->mb.i_type == I_8x8 ) + { + for( i = 0; i < 4; i++ ) + { + const int i_dst = h->mb.pic.i_stride[0]; + uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * i_dst]; + int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]]; + + h->predict_8x8[i_mode]( p_dst, i_dst, h->mb.i_neighbour8[i] ); + x264_mb_encode_i8x8( h, i, i_qscale ); + h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]] = x264_mb_pred_mode4x4_fix(i_mode); + } + } else if( h->mb.i_type == I_4x4 ) { for( i = 0; i < 16; i++ ) @@ -580,83 +640,95 @@ void x264_macroblock_encode( x264_t *h ) uint8_t *p_dst = &h->mb.pic.p_fdec[0][4 * block_idx_x[i] + 4 * block_idx_y[i] * i_dst]; int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]; - /* Do the right prediction */ h->predict_4x4[i_mode]( p_dst, i_dst ); - - /* encode one 4x4 block */ x264_mb_encode_i4x4( h, i, i_qscale ); - - /* fix the pred mode value */ - h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix[i_mode]; + h->mb.cache.intra4x4_pred_mode[x264_scan8[i]] = x264_mb_pred_mode4x4_fix(i_mode); } } else /* Inter MB */ { - int16_t dct4x4[16][4][4]; - int i8x8, i4x4, idx; int i_decimate_mb = 0; /* Motion compensation */ x264_mb_mc( h ); - 
h->dctf.sub16x16_dct( dct4x4, - h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], - h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] ); - - for( i8x8 = 0; i8x8 < 4; i8x8++ ) + if( h->mb.b_transform_8x8 ) { - int i_decimate_8x8; + int16_t dct8x8[4][8][8]; + h->dctf.sub16x16_dct8( dct8x8, + h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], + h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] ); - /* encode one 4x4 block */ - i_decimate_8x8 = 0; - for( i4x4 = 0; i4x4 < 4; i4x4++ ) + for( idx = 0; idx < 4; idx++ ) { - idx = i8x8 * 4 + i4x4; + int i_decimate_8x8; - quant_4x4( dct4x4[idx], i_qscale, 0 ); - scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] ); - x264_mb_dequant_4x4( dct4x4[idx], i_qscale ); + quant_8x8( dct8x8[idx], i_qscale, 0 ); + scan_zigzag_8x8full( h->dct.luma8x8[idx], dct8x8[idx] ); + x264_mb_dequant_8x8( dct8x8[idx], i_qscale ); - i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 ); + i_decimate_8x8 = x264_mb_decimate_score( h->dct.luma8x8[idx], 64 ); + i_decimate_mb += i_decimate_8x8; + if( i_decimate_8x8 < 4 ) + { + memset( h->dct.luma8x8[idx], 0, sizeof( h->dct.luma8x8[idx] ) ); + memset( dct8x8[idx], 0, sizeof( dct8x8[idx] ) ); + } } - /* decimate this 8x8 block */ - i_decimate_mb += i_decimate_8x8; - if( i_decimate_8x8 < 4 ) + if( i_decimate_mb < 6 ) + memset( h->dct.luma8x8, 0, sizeof( h->dct.luma8x8 ) ); + else + h->dctf.add16x16_idct8( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct8x8 ); + } + else + { + int16_t dct4x4[16][4][4]; + h->dctf.sub16x16_dct( dct4x4, + h->mb.pic.p_fenc[0], h->mb.pic.i_stride[0], + h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0] ); + + for( i8x8 = 0; i8x8 < 4; i8x8++ ) { + int i_decimate_8x8; + + /* encode one 4x4 block */ + i_decimate_8x8 = 0; for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - int x, y; idx = i8x8 * 4 + i4x4; - for( i = 0; i < 16; i++ ) - { - h->dct.block[idx].luma4x4[i] = 0; - } - for( x = 0; x < 4; x++ ) - { - for( y = 0; y < 4; y++ ) - { - dct4x4[idx][x][y] = 0; - } - } + + quant_4x4( dct4x4[idx], 
i_qscale, 0 ); + scan_zigzag_4x4full( h->dct.block[idx].luma4x4, dct4x4[idx] ); + x264_mb_dequant_4x4( dct4x4[idx], i_qscale ); + + i_decimate_8x8 += x264_mb_decimate_score( h->dct.block[idx].luma4x4, 16 ); } - } - } - if( i_decimate_mb < 6 ) - { - for( idx = 0; idx < 16; idx++ ) - { - for( i = 0; i < 16; i++ ) + /* decimate this 8x8 block */ + i_decimate_mb += i_decimate_8x8; + if( i_decimate_8x8 < 4 ) { - h->dct.block[idx].luma4x4[i] = 0; + for( i4x4 = 0; i4x4 < 4; i4x4++ ) + { + int x, y; + idx = i8x8 * 4 + i4x4; + for( i = 0; i < 16; i++ ) + h->dct.block[idx].luma4x4[i] = 0; + for( x = 0; x < 4; x++ ) + for( y = 0; y < 4; y++ ) + dct4x4[idx][x][y] = 0; + } } } - } - else - { - h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 ); + + if( i_decimate_mb < 6 ) + for( idx = 0; idx < 16; idx++ ) + for( i = 0; i < 16; i++ ) + h->dct.block[idx].luma4x4[i] = 0; + else + h->dctf.add16x16_idct( h->mb.pic.p_fdec[0], h->mb.pic.i_stride[0], dct4x4 ); } } @@ -666,41 +738,50 @@ void x264_macroblock_encode( x264_t *h ) { const int i_mode = h->mb.i_chroma_pred_mode; /* do the right prediction */ - h->predict_8x8[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] ); - h->predict_8x8[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] ); + h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1], h->mb.pic.i_stride[1] ); + h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2], h->mb.pic.i_stride[2] ); /* fix the pred mode value */ - h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8_fix[i_mode]; + h->mb.i_chroma_pred_mode = x264_mb_pred_mode8x8c_fix[i_mode]; } /* encode the 8x8 blocks */ - x264_mb_encode_8x8( h, !IS_INTRA( h->mb.i_type ), i_qscale ); + x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), i_qscale ); /* Calculate the Luma/Chroma patern and non_zero_count */ + h->mb.i_cbp_luma = 0x00; if( h->mb.i_type == I_16x16 ) { - h->mb.i_cbp_luma = 0x00; for( i = 0; i < 16; i++ ) { const int nz = array_non_zero_count( h->dct.block[i].residual_ac, 15 ); 
h->mb.cache.non_zero_count[x264_scan8[i]] = nz; if( nz > 0 ) - { h->mb.i_cbp_luma = 0x0f; - } + } + } + else if( h->mb.b_transform_8x8 ) + { + /* coded_block_flag is enough for CABAC, + * but CAVLC needs the full non_zero_count. */ + for( i = 0; i < 4; i++ ) + { + const int nz = array_non_zero( h->dct.luma8x8[i], 64 ); + int j; + for( j = 0; j < 4; j++ ) + h->mb.cache.non_zero_count[x264_scan8[4*i+j]] = nz; + if( nz > 0 ) + h->mb.i_cbp_luma |= 1 << i; } } else { - h->mb.i_cbp_luma = 0x00; for( i = 0; i < 16; i++ ) { const int nz = array_non_zero_count( h->dct.block[i].luma4x4, 16 ); h->mb.cache.non_zero_count[x264_scan8[i]] = nz; if( nz > 0 ) - { h->mb.i_cbp_luma |= 1 << (i/4); - } } } @@ -772,6 +853,9 @@ void x264_macroblock_encode( x264_t *h ) h->mb.type[h->mb.i_mb_xy] = h->mb.i_type = B_SKIP; h->mb.qp[h->mb.i_mb_xy] = h->mb.i_last_qp; /* Needed */ } + + if( h->mb.i_cbp_luma == 0 && h->mb.i_type != I_8x8 ) + h->mb.b_transform_8x8 = 0; } /***************************************************************************** diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 6c8768ae..a16bcf10 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -39,5 +39,26 @@ void x264_macroblock_write_cavlc ( x264_t *h, bs_t *s ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); +static inline int array_non_zero( int *v, int i_count ) +{ + int i; + for( i = 0; i < i_count; i++ ) + if( v[i] ) return 1; + return 0; +} + +static inline int array_non_zero_count( int *v, int i_count ) +{ + int i; + int i_nz; + + for( i = 0, i_nz = 0; i < i_count; i++ ) + if( v[i] ) + i_nz++; + + return i_nz; +} + + #endif diff --git a/encoder/set.c b/encoder/set.c index 8406118d..4f94bad9 100644 --- a/encoder/set.c +++ b/encoder/set.c @@ -40,12 +40,14 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) { - sps->i_id = i_id; + sps->i_id = i_id; - if( param->b_cabac || param->i_bframe > 0 ) - sps->i_profile_idc = PROFILE_MAIN; + if( param->analyse.b_transform_8x8 ) + 
sps->i_profile_idc = PROFILE_HIGH; + else if( param->b_cabac || param->i_bframe > 0 ) + sps->i_profile_idc = PROFILE_MAIN; else - sps->i_profile_idc = PROFILE_BASELINE; + sps->i_profile_idc = PROFILE_BASELINE; sps->i_level_idc = param->i_level_idc; sps->b_constraint_set0 = 0; @@ -160,6 +162,16 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps ) bs_write( s, 8, sps->i_level_idc ); bs_write_ue( s, sps->i_id ); + + if( sps->i_profile_idc >= PROFILE_HIGH ) + { + bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0 + bs_write_ue( s, 0 ); // bit_depth_luma_minus8 + bs_write_ue( s, 0 ); // bit_depth_chroma_minus8 + bs_write( s, 1, 0 ); // qpprime_y_zero_transform_bypass_flag + bs_write( s, 1, 0 ); // seq_scaling_matrix_present_flag + } + bs_write_ue( s, sps->i_log2_max_frame_num - 4 ); bs_write_ue( s, sps->i_poc_type ); if( sps->i_poc_type == 0 ) @@ -326,6 +338,8 @@ void x264_pps_init( x264_pps_t *pps, int i_id, x264_param_t *param, x264_sps_t * pps->b_deblocking_filter_control = 1; pps->b_constrained_intra_pred = 0; pps->b_redundant_pic_cnt = 0; + + pps->b_transform_8x8_mode = param->analyse.b_transform_8x8 ? 
1 : 0; } void x264_pps_write( bs_t *s, x264_pps_t *pps ) @@ -389,6 +403,13 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps ) bs_write( s, 1, pps->b_constrained_intra_pred ); bs_write( s, 1, pps->b_redundant_pic_cnt ); + if( pps->b_transform_8x8_mode ) + { + bs_write( s, 1, pps->b_transform_8x8_mode ); + bs_write( s, 1, 0 ); // pic_scaling_matrix_present_flag + bs_write_se( s, 0 ); // second_chroma_qp_index_offset + } + bs_rbsp_trailing( s ); } diff --git a/encoder/slicetype_decision.c b/encoder/slicetype_decision.c index b24d891b..eadce569 100644 --- a/encoder/slicetype_decision.c +++ b/encoder/slicetype_decision.c @@ -197,7 +197,7 @@ lowres_intra_mb: for( i = I_PRED_CHROMA_DC; i <= I_PRED_CHROMA_P; i++ ) { int i_cost; - h->predict_8x8[i]( &pix1[10], 9 ); + h->predict_8x8c[i]( &pix1[10], 9 ); i_cost = h->pixf.satd[PIXEL_8x8]( &pix1[10], 9, src, i_stride ) + intra_penalty; i_bcost = X264_MIN( i_bcost, i_cost ); } diff --git a/x264.h b/x264.h index a1a0131a..14a11d21 100644 --- a/x264.h +++ b/x264.h @@ -26,7 +26,7 @@ #include -#define X264_BUILD 28 +#define X264_BUILD 29 /* x264_t: * opaque handler for decoder and encoder */ @@ -48,6 +48,7 @@ typedef struct x264_t x264_t; /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ +#define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ #define X264_ANALYSE_PSUB16x16 0x0010 /* Analyse p16x8, p8x16 and p8x8 */ #define X264_ANALYSE_PSUB8x8 0x0020 /* Analyse p8x4, p4x8, p4x4 */ #define X264_ANALYSE_BSUB16x16 0x0100 /* Analyse b16x8, b8x16 and b8x8 */ @@ -149,8 +150,10 @@ typedef struct /* Encoder analyser parameters */ struct { - unsigned int intra; /* intra flags */ - unsigned int inter; /* inter flags */ + unsigned int intra; /* intra partitions */ + unsigned int inter; /* inter partitions */ + + int b_transform_8x8; int i_direct_mv_pred; /* spatial vs temporal mv prediction */ int i_me_method; /* motion estimation algorithm to use (X264_ME_*) */