From 5b0cb86f27ba0c5433c404bed51c06a5124dfb49 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Fri, 26 Aug 2011 15:57:04 +0200 Subject: [PATCH] 4:2:2 encoding support --- AUTHORS | 5 + common/bitstream.h | 7 +- common/common.c | 68 +++++--- common/common.h | 19 +- common/dct.c | 91 +++++++++- common/dct.h | 4 + common/deblock.c | 142 +++++++++------ common/frame.c | 88 +++++----- common/frame.h | 8 + common/macroblock.c | 169 ++++++++++-------- common/macroblock.h | 9 - common/mc.c | 23 ++- common/mc.h | 23 ++- common/pixel.c | 88 ++++++---- common/pixel.h | 61 ++++--- common/predict.c | 175 ++++++++++++++++++- common/predict.h | 7 +- common/quant.c | 198 ++++++++++++++------- common/quant.h | 10 +- common/set.h | 9 +- common/vlc.c | 115 +++++++++++- common/x86/mc-a2.asm | 21 +-- common/x86/mc-c.c | 50 +++--- common/x86/quant-a.asm | 14 +- common/x86/quant.h | 8 +- encoder/analyse.c | 227 +++++++++++++----------- encoder/cabac.c | 200 +++++++++++++-------- encoder/cavlc.c | 67 ++++--- encoder/encoder.c | 103 ++++++++--- encoder/macroblock.c | 386 ++++++++++++++++++++++++++++------------- encoder/macroblock.h | 9 +- encoder/me.c | 52 +++--- encoder/ratecontrol.c | 26 +-- encoder/rdo.c | 103 ++++++----- encoder/set.c | 27 +-- encoder/slicetype.c | 35 ++-- filters/video/depth.c | 8 +- filters/video/resize.c | 7 +- input/avs.c | 23 ++- input/input.c | 4 +- input/input.h | 3 +- tools/checkasm.c | 290 +++++++++++++++++++++---------- x264.c | 7 +- x264.h | 21 ++- 44 files changed, 2044 insertions(+), 966 deletions(-) diff --git a/AUTHORS b/AUTHORS index 8acaba47..60ffb621 100644 --- a/AUTHORS +++ b/AUTHORS @@ -42,6 +42,11 @@ E: gpoirier CHEZ mplayerhq POINT hu D: Altivec optimizations S: Brittany, France +N: Henrik Gramner +E: hengar-6 AT student DOT ltu DOT se +D: 4:2:2 chroma subsampling, x86 asm +S: Sweden + N: Fiona Glaser E: fiona AT x264 DOT com D: x86 asm, 1pass VBV, adaptive quantization, inline asm diff --git a/common/bitstream.h b/common/bitstream.h index 6300e52a..058db8b4 100644 --- a/common/bitstream.h +++ b/common/bitstream.h @@ -60,10 +60,11 @@ typedef struct uint8_t run[16]; } x264_run_level_t; -extern const vlc_t x264_coeff0_token[5]; -extern const vlc_t x264_coeff_token[5][16][4]; +extern const vlc_t x264_coeff0_token[6]; +extern const vlc_t x264_coeff_token[6][16][4]; extern const vlc_t x264_total_zeros[15][16]; -extern const vlc_t x264_total_zeros_dc[3][4]; +extern const vlc_t x264_total_zeros_2x2_dc[3][4]; +extern const vlc_t x264_total_zeros_2x4_dc[7][8]; extern const vlc_t x264_run_before[7][16]; typedef struct diff --git a/common/common.c b/common/common.c index ce076e59..4c978d3c 100644 --- a/common/common.c +++ b/common/common.c @@ -426,21 +426,57 @@ void x264_param_apply_fastfirstpass( x264_param_t *param ) } } +static int profile_string_to_int( const char *str ) +{ + if( !strcasecmp( str, "baseline" ) ) + return PROFILE_BASELINE; + if( !strcasecmp( str, "main" ) ) + return PROFILE_MAIN; + if( !strcasecmp( str, "high" ) ) + return PROFILE_HIGH; + if( !strcasecmp( str, "high10" ) ) + return PROFILE_HIGH10; + if( !strcasecmp( str, "high422" ) ) + return PROFILE_HIGH422; + if( !strcasecmp( str, "high444" ) ) + return PROFILE_HIGH444_PREDICTIVE; + return -1; +} + int x264_param_apply_profile( x264_param_t *param, const char *profile ) { if( !profile ) return 0; -#if BIT_DEPTH > 8 - if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) || - !strcasecmp( profile, "high" ) ) + int p = profile_string_to_int( profile ); + if( p < 0 ) { - x264_log( NULL, 
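x264_param_apply_profile() now maps the profile string to an ordinal (baseline < main < high < high10 < high422 < high444), so every capability check becomes a plain comparison against the weakest profile that supports the feature. A usage sketch, not part of the patch:

    #include "x264.h"
    /* Hypothetical caller: request 4:2:2 input and a profile able to carry it. */
    static int setup_422( x264_param_t *param )
    {
        x264_param_default_preset( param, "medium", NULL );
        param->i_csp = X264_CSP_I422;   /* colorspace added by this patch */
        /* "high" would now fail with "doesn't support 4:2:2": */
        return x264_param_apply_profile( param, "high422" );
    }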
X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH ); + x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile ); + return -1; + } + if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) || + (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile ); + return -1; + } + if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile ); + return -1; + } + if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile ); + return -1; + } + if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 ) + { + x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH ); return -1; } -#endif - if( !strcasecmp( profile, "baseline" ) ) + if( p == PROFILE_BASELINE ) { param->analyse.b_transform_8x8 = 0; param->b_cabac = 0; @@ -459,27 +495,12 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) return -1; } } - else if( !strcasecmp( profile, "main" ) ) + else if( p == PROFILE_MAIN ) { param->analyse.b_transform_8x8 = 0; param->i_cqm_preset = X264_CQM_FLAT; param->psz_cqm_file = NULL; } - else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) ) - { - /* Default */ - } - else - { - x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile ); - return -1; - } - if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) || - (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0) ) - { - x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile ); - return -1; - } return 0; } @@ -1075,6 +1096,9 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, + [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, + [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, }, diff --git a/common/common.h b/common/common.h index a4e1cf96..d1f830f6 100644 --- a/common/common.h +++ b/common/common.h @@ -40,6 +40,9 @@ #define IS_DISPOSABLE(type) ( type == X264_TYPE_B ) #define FIX8(f) ((int)(f*(1<<8)+.5)) #define ALIGN(x,a) (((x)+((a)-1))&~((a)-1)) +#define CHROMA_FORMAT h->sps->i_chroma_format_idc +#define CHROMA_SIZE(s) ((s)>>(h->mb.chroma_h_shift+h->mb.chroma_v_shift)) +#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s)) #define CHECKED_MALLOC( var, size )\ do {\ @@ -56,7 +59,7 @@ do {\ #define X264_BFRAME_MAX 16 #define X264_REF_MAX 16 #define X264_THREAD_MAX 128 -#define X264_PCM_COST ((384<sps->i_chroma_format_idc == 3) +#define CHROMA444 (CHROMA_FORMAT == CHROMA_444) /* Unions for type-punning. 
* Mn: load or store n bits, aligned, native-endian @@ -565,7 +568,7 @@ struct x264_t struct { ALIGNED_16( dctcoef luma16x16_dc[3][16] ); - ALIGNED_16( dctcoef chroma_dc[2][4] ); + ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? ALIGNED_16( dctcoef luma8x8[12][64] ); ALIGNED_16( dctcoef luma4x4[16*3][16] ); @@ -578,6 +581,10 @@ struct x264_t int i_mb_height; int i_mb_count; /* number of mbs in a frame */ + /* Chroma subsampling */ + int chroma_h_shift; + int chroma_v_shift; + /* Strides */ int i_mb_stride; int i_b8_stride; @@ -882,6 +889,8 @@ struct x264_t ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; + uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ + /* Buffers that are allocated per-thread even in sliced threads. */ void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */ pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */ @@ -891,9 +900,11 @@ struct x264_t /* CPU functions dependents */ x264_predict_t predict_16x16[4+3]; - x264_predict_t predict_8x8c[4+3]; x264_predict8x8_t predict_8x8[9+3]; x264_predict_t predict_4x4[9+3]; + x264_predict_t predict_chroma[4+3]; + x264_predict_t predict_8x8c[4+3]; + x264_predict_t predict_8x16c[4+3]; x264_predict_8x8_filter_t predict_8x8_filter; x264_pixel_function_t pixf; diff --git a/common/dct.c b/common/dct.c index 9653ee47..cf8a2351 100644 --- a/common/dct.c +++ b/common/dct.c @@ -5,6 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -100,6 +101,42 @@ static void idct4x4dc( dctcoef d[16] ) } } +static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] ) +{ + int a0 = dct4x4[0][0] + dct4x4[1][0]; + int a1 = dct4x4[2][0] + dct4x4[3][0]; + int a2 = dct4x4[4][0] + dct4x4[5][0]; + int a3 = dct4x4[6][0] + dct4x4[7][0]; + int a4 = dct4x4[0][0] - dct4x4[1][0]; + int a5 = dct4x4[2][0] - dct4x4[3][0]; + int a6 = dct4x4[4][0] - dct4x4[5][0]; + int a7 = dct4x4[6][0] - dct4x4[7][0]; + int b0 = a0 + a1; + int b1 = a2 + a3; + int b2 = a4 + a5; + int b3 = a6 + a7; + int b4 = a0 - a1; + int b5 = a2 - a3; + int b6 = a4 - a5; + int b7 = a6 - a7; + dct[0] = b0 + b1; + dct[1] = b2 + b3; + dct[2] = b0 - b1; + dct[3] = b2 - b3; + dct[4] = b4 - b5; + dct[5] = b6 - b7; + dct[6] = b4 + b5; + dct[7] = b6 + b7; + dct4x4[0][0] = 0; + dct4x4[1][0] = 0; + dct4x4[2][0] = 0; + dct4x4[3][0] = 0; + dct4x4[4][0] = 0; + dct4x4[5][0] = 0; + dct4x4[6][0] = 0; + dct4x4[7][0] = 0; +} + static inline void pixel_sub_wxh( dctcoef *diff, int i_size, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { @@ -164,14 +201,10 @@ static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ) static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 ) { - dctcoef d[16]; int sum = 0; - - pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); - - sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; - sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15]; - + for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE ) + sum += pix1[0] + pix1[1] + pix1[2] + pix1[3] + - pix2[0] - pix2[1] - pix2[2] - pix2[3]; return sum; } @@ -188,11 +221,49 @@ static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 ) int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; dct[0] = d0 + d1; - dct[2] = d2 + d3; dct[1] = d0 - d1; + 
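dct2x4dc() above is the 4:2:2 counterpart of the 2x2 chroma DC Hadamard: it gathers the DC term of each of the eight 4x4 chroma blocks, transforms them, and zeroes the per-block DCs so they are not coded twice. Functionally it is a separable 2-point transform across the two columns followed by a 4-point Hadamard down the four rows, with the outputs stored in a custom order; a reference model written for illustration, not code from the patch:

    static void dct2x4dc_ref( int out[8], const int in[8] ) /* in: row-major 2x4 */
    {
        static const int sign[4][4] = { {1,1,1,1}, {1,1,-1,-1}, {1,-1,-1,1}, {1,-1,1,-1} };
        static const int pos[2][4]  = { {0,2,4,6}, {1,3,5,7} }; /* column sums land in even slots */
        for( int c = 0; c < 2; c++ )      /* 0: column sum, 1: column difference */
            for( int r = 0; r < 4; r++ )  /* 4-point Hadamard down the rows */
            {
                int t = 0;
                for( int i = 0; i < 4; i++ )
                    t += sign[r][i] * (in[2*i] + (c ? -in[2*i+1] : in[2*i+1]));
                out[pos[c][r]] = t;
            }
    }

sub8x16_dct_dc() in the next hunk applies the same butterfly network directly to pixel-domain DC sums.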
dct[2] = d2 + d3; dct[3] = d2 - d3; } +static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 ) +{ + int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] ); + int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] ); + int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] ); + int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] ); + int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] ); + int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] ); + int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] ); + int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] ); + + /* 2x4 DC transform */ + int b0 = a0 + a1; + int b1 = a2 + a3; + int b2 = a4 + a5; + int b3 = a6 + a7; + int b4 = a0 - a1; + int b5 = a2 - a3; + int b6 = a4 - a5; + int b7 = a6 - a7; + a0 = b0 + b1; + a1 = b2 + b3; + a2 = b4 + b5; + a3 = b6 + b7; + a4 = b0 - b1; + a5 = b2 - b3; + a6 = b4 - b5; + a7 = b6 - b7; + dct[0] = a0 + a1; + dct[1] = a2 + a3; + dct[2] = a0 - a1; + dct[3] = a2 - a3; + dct[4] = a4 - a5; + dct[5] = a6 - a7; + dct[6] = a4 + a5; + dct[7] = a6 + a7; +} + static void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) { dctcoef d[16]; @@ -408,6 +479,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->add8x8_idct = add8x8_idct; dctf->add8x8_idct_dc = add8x8_idct_dc; + dctf->sub8x16_dct_dc = sub8x16_dct_dc; + dctf->sub16x16_dct = sub16x16_dct; dctf->add16x16_idct = add16x16_idct; dctf->add16x16_idct_dc = add16x16_idct_dc; @@ -421,6 +494,8 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->dct4x4dc = dct4x4dc; dctf->idct4x4dc = idct4x4dc; + dctf->dct2x4dc = dct2x4dc; + #if HIGH_BIT_DEPTH #if HAVE_MMX if( cpu&X264_CPU_MMX ) diff --git a/common/dct.h b/common/dct.h index a764e491..044ad1e1 100644 --- a/common/dct.h +++ b/common/dct.h @@ -104,6 +104,8 @@ typedef struct void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] ); void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] ); + void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 ); + void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 ); void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] ); void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] ); @@ -117,6 +119,8 @@ typedef struct void (*dct4x4dc) ( dctcoef d[16] ); void (*idct4x4dc)( dctcoef d[16] ); + void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] ); + } x264_dct_function_t; typedef struct diff --git a/common/deblock.c b/common/deblock.c index 22d37635..a1108b20 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -119,7 +120,7 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); @@ -147,33 +148,42 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void 
deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { int tc = tc0[i]; if( tc <= 0 ) { - pix += 2*ystride; + pix += height*ystride; continue; } - for( int d = 0; d < 2; d++, pix += ystride-2 ) - for( int e = 0; e < 2; e++, pix++ ) - deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < 2; e++, pix++ ) + deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++, pix += stride ) deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] ); } +static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + for( int i = 0; i < 8; i++, pix += stride ) + deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] ); +} static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) { - deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 ); + deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); +} +static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +{ + deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta ) @@ -220,7 +230,7 @@ static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, i for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) +static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); @@ -247,24 +257,33 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } -static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir ) +static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta ) { - for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 ) - for( int e = 0; e < (dir?1:2); e++, pix++ ) - deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); + for( int d = 0; d < height; d++, pix += ystride-2 ) + for( int e = 0; e < width; e++, pix++ ) + deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) { for( int i = 0; i < 4; i++, pix += stride ) deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); } +static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +{ + for( int i = 0; i < 8; i++, pix += stride ) + deblock_edge_chroma_intra_c( pix, 2, alpha, beta ); +} static void 
deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
 {
-    deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
+    deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
 }
 static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
 {
-    deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
+    deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
+}
+static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
+{
+    deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
 }
 
 static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
@@ -375,6 +394,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
     int stridey = h->fdec->i_stride[0];
     int strideuv = h->fdec->i_stride[1];
     int chroma444 = CHROMA444;
+    int chroma_height = 16 >> h->mb.chroma_v_shift;
     intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
 
     for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
@@ -388,12 +408,12 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
 
         pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
-        pixel *pixuv = h->fdec->plane[1] + (8<<CHROMA444)*mb_y*strideuv + 16*mb_x;
+        pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
 
         if( mb_y & MB_INTERLACED )
         {
             pixy -= 15*stridey;
-            pixuv -= ((8<<CHROMA444)-1)*strideuv;
+            pixuv -= (chroma_height-1)*strideuv;
         }
@@ -413,21 +433,34 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
         #define FILTER( intra, dir, edge, qp, chroma_qp )\
         do\
         {\
-            deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
-                                 stride2y, bs[dir][edge], qp, a, b, 0,\
-                                 h->loopf.deblock_luma##intra[dir] );\
-            if( chroma444 )\
+            if( !(edge & 1) || !transform_8x8 )\
             {\
-                deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
-                                     h->loopf.deblock_luma##intra[dir] );\
-                deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
-                                     stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
+                                     stride2y, bs[dir][edge], qp, a, b, 0,\
                                      h->loopf.deblock_luma##intra[dir] );\
+                if( CHROMA_FORMAT == CHROMA_444 )\
+                {\
+                    deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                    deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+                                         h->loopf.deblock_luma##intra[dir] );\
+                }\
+                else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
+                {\
+                    deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
+                                         stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
+                                         h->loopf.deblock_chroma##intra[dir] );\
+                }\
             }\
-            else if( !(edge & 1) )\
-                deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
+            if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
+            {\
+                deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
                                      stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
                                      h->loopf.deblock_chroma##intra[dir] );\
+            }\
         } while(0)
 
         if( h->mb.i_neighbour & MB_LEFT )
@@ -431,9 +462,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
                 int chroma_qp[2];
                 int left_qp[2];
                 x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
-                x264_deblock_inter_t chroma_deblock = chroma444 ? h->loopf.deblock_luma_mbaff : h->loopf.deblock_chroma_mbaff;
+                x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
                 x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
-                x264_deblock_intra_t chroma_intra_deblock = chroma444 ? h->loopf.deblock_luma_intra_mbaff : h->loopf.deblock_chroma_intra_mbaff;
+                x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
                 int c = chroma444 ?
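The reworked FILTER macro folds the per-format chroma rules into one place: 4:4:4 filters chroma exactly like luma, 4:2:0 filters chroma only on even edges in both directions, and 4:2:2 — whose chroma blocks are 8x16 — filters every horizontal edge but only the even vertical ones. Restated as a predicate for illustration (ignoring the luma 8x8-transform skip), not code from the patch:

    /* dir: 0 = vertical edges, 1 = horizontal edges; edge = 0..3 */
    static int chroma_edge_filtered( int chroma_format_idc, int dir, int edge )
    {
        if( chroma_format_idc == 3 ) return 1;                  /* 4:4:4: same grid as luma */
        if( chroma_format_idc == 2 ) return dir || !(edge & 1); /* 4:2:2: all rows, even columns */
        return !(edge & 1);                                     /* 4:2:0: even edges only */
    }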
0 : 1; left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]]; @@ -453,8 +484,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } int offy = MB_INTERLACED ? 4 : 0; - int offuv = MB_INTERLACED ? 3 : 0; - if( chroma444 ) offuv = offy; + int offuv = MB_INTERLACED ? 4-h->mb.chroma_v_shift : 0; left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]]; luma_qp[1] = (qp + left_qp[1] + 1) >> 1; chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1; @@ -486,9 +516,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) } if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc ); - FILTER( , 0, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc ); + FILTER( , 0, 1, qp, qpc ); + FILTER( , 0, 2, qp, qpc ); + FILTER( , 0, 3, qp, qpc ); } if( h->mb.i_neighbour & MB_TOP ) @@ -540,9 +570,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) if( !first_edge_only ) { - if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc ); - FILTER( , 1, 2, qp, qpc ); - if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc ); + FILTER( , 1, 1, qp, qpc ); + FILTER( , 1, 2, qp, qpc ); + FILTER( , 1, 3, qp, qpc ); } #undef FILTER @@ -553,7 +583,7 @@ void x264_frame_deblock_row( x264_t *h, int mb_y ) * TODO: * deblock macroblock edges * support analysis partitions smaller than 16x16 - * deblock chroma for 4:2:0 + * deblock chroma for 4:2:0/4:2:2 * handle duplicate refs correctly * handle cavlc+8x8dct correctly */ @@ -683,15 +713,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = deblock_v_luma_c; pf->deblock_luma[0] = deblock_h_luma_c; pf->deblock_chroma[1] = deblock_v_chroma_c; - pf->deblock_chroma[0] = deblock_h_chroma_c; + pf->deblock_h_chroma_420 = deblock_h_chroma_c; + pf->deblock_h_chroma_422 = deblock_h_chroma_422_c; pf->deblock_luma_intra[1] = deblock_v_luma_intra_c; pf->deblock_luma_intra[0] = deblock_h_luma_intra_c; pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c; - pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c; - pf->deblock_luma_mbaff = deblock_v_luma_mbaff_c; - pf->deblock_chroma_mbaff = deblock_v_chroma_mbaff_c; - pf->deblock_luma_intra_mbaff = deblock_v_luma_intra_mbaff_c; - pf->deblock_chroma_intra_mbaff = deblock_v_chroma_intra_mbaff_c; + pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c; + pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c; + pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c; + pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c; + pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c; + pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c; + pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c; + pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c; pf->deblock_strength = deblock_strength_c; #if HAVE_MMX @@ -701,11 +735,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_mmx2; pf->deblock_luma[0] = x264_deblock_h_luma_mmx2; pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2; - pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2; #endif pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) @@ 
-716,11 +750,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_sse2; pf->deblock_luma[0] = x264_deblock_h_luma_sse2; pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2; - pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2; } } if( cpu&X264_CPU_SSSE3 ) @@ -733,11 +767,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_avx; pf->deblock_luma[0] = x264_deblock_h_luma_avx; pf->deblock_chroma[1] = x264_deblock_v_chroma_avx; - pf->deblock_chroma[0] = x264_deblock_h_chroma_avx; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx; - pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx; } } } @@ -758,7 +792,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; // pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; -// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; +// pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; } #endif #endif // !HIGH_BIT_DEPTH diff --git a/common/frame.c b/common/frame.c index b95c2a86..594aeccc 100644 --- a/common/frame.c +++ b/common/frame.c @@ -50,6 +50,10 @@ static int x264_frame_internal_csp( int external_csp ) case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; + case X264_CSP_NV16: + case X264_CSP_I422: + case X264_CSP_YV16: + return X264_CSP_NV16; case X264_CSP_I444: case X264_CSP_YV24: case X264_CSP_BGR: @@ -66,11 +70,10 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) x264_frame_t *frame; int i_csp = x264_frame_internal_csp( h->param.i_csp ); int i_mb_count = h->mb.i_mb_count; - int i_stride, i_width, i_lines; + int i_stride, i_width, i_lines, luma_plane_count; int i_padv = PADV << PARAM_INTERLACED; int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16; int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10; - int luma_plane_count = i_csp == X264_CSP_NV12 ? 
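Frame allocation treats NV16 like NV12 except vertically: the interleaved chroma plane keeps full height and full vertical padding. The rule used in the allocation below, restated for illustration only:

    int chroma_padv       = i_padv  >> (i_csp == X264_CSP_NV12); /* halved only for 4:2:0 */
    int chroma_lines      = i_lines >> (i_csp == X264_CSP_NV12);
    int chroma_plane_size = i_stride * (chroma_lines + 2*chroma_padv);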
1 : 3; CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) ); @@ -79,18 +82,20 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) i_lines = h->mb.i_mb_height*16; i_stride = align_stride( i_width + 2*PADH, align, disalign ); - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { + luma_plane_count = 1; frame->i_plane = 2; for( int i = 0; i < 2; i++ ) { frame->i_width[i] = i_width >> i; - frame->i_lines[i] = i_lines >> i; + frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12); frame->i_stride[i] = i_stride; } } else if( i_csp == X264_CSP_I444 ) { + luma_plane_count = 3; frame->i_plane = 3; for( int i = 0; i < 3; i++ ) { @@ -130,15 +135,16 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) frame->orig = frame; - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { - int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv)); + int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12); + int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv)); CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) ); - frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH; + frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH; if( PARAM_INTERLACED ) { CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) ); - frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH; + frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH; } } @@ -367,23 +373,25 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) } else { + int v_shift = h->mb.chroma_v_shift; get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 ); h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0], stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height ); - if( i_csp == X264_CSP_NV12 ) + if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 ) { - get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 ); + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], - stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 ); + stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } - else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_YV12 ) + else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { - get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 ); - get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 ); + int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16; + get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift ); + get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 
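YV12 and YV16 store V before U, so the copy below swaps the source plane indices before interleaving into the internal NV12/NV16 layout; a restatement for illustration:

    int u_plane = uv_swap ? 2 : 1; /* I420/I422: Y,U,V  --  YV12/YV16: Y,V,U */
    int v_plane = uv_swap ? 1 : 2;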
1 : 2, 1, v_shift ); h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), (pixel*)pix[2], stride[2]/sizeof(pixel), - h->param.i_width>>1, h->param.i_height>>1 ); + h->param.i_width>>1, h->param.i_height>>v_shift ); } else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 ) { @@ -478,33 +486,34 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e return; for( int i = 0; i < frame->i_plane; i++ ) { - int shift = i && !CHROMA444; + int h_shift = i && h->mb.chroma_h_shift; + int v_shift = i && h->mb.chroma_v_shift; int stride = frame->i_stride[i]; int width = 16*h->mb.i_mb_width; - int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> shift; + int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift; int padh = PADH; - int padv = PADV >> shift; + int padv = PADV >> v_shift; // buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb if( b_end && !b_start ) - height += 4 >> (shift + SLICE_MBAFF); + height += 4 >> (v_shift + SLICE_MBAFF); pixel *pix; if( SLICE_MBAFF ) { // border samples for each field are extended separately - pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift); - plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, shift ); - plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, shift ); + pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift); + plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift ); + plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift ); - height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> shift; + height = (b_end ? 
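Border extension now splits the old CHROMA444-based shift into the two subsampling axes: every plane pads PADH columns (for chroma, the interleaved U/V samples make up the width, which is what the h_shift argument to plane_expand_border encodes), while vertical padding shrinks by v_shift, so a 4:2:2 chroma plane is padded over its full height. Restated for illustration:

    int padh = PADH;            /* per-plane horizontal padding, format-independent */
    int padv = PADV >> v_shift; /* halved for 4:2:0 only; 4:2:2 pads full height */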
16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
             if( b_end && !b_start )
-                height += 4 >> shift;
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+                height += 4 >> v_shift;
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
         }
         else
         {
-            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
-            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+            pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+            plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
         }
     }
 }
@@ -545,9 +554,9 @@ void x264_frame_expand_border_lowres( x264_frame_t *frame )
 
 void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
 {
-    int shift = !CHROMA444;
-    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>shift,
-                         PADH, PADV>>shift, 1, 1, shift );
+    int v_shift = h->mb.chroma_v_shift;
+    plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
+                         PADH, PADV>>v_shift, 1, 1, h->mb.chroma_h_shift );
 }
 
 void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
@@ -555,17 +564,18 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
     for( int i = 0; i < frame->i_plane; i++ )
     {
         int i_width = h->param.i_width;
-        int shift = i && !CHROMA444;
-        int i_height = h->param.i_height >> shift;
+        int h_shift = i && h->mb.chroma_h_shift;
+        int v_shift = i && h->mb.chroma_v_shift;
+        int i_height = h->param.i_height >> v_shift;
 
         int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
-        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
 
         if( i_padx )
         {
             for( int y = 0; y < i_height; y++ )
                 pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
-                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-shift],
-                              i_padx>>shift, sizeof(pixel)<<shift );
+                              &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
+                              i_padx>>h_shift, sizeof(pixel)<<h_shift );
         }
@@ -580,9 +590,9 @@ void x264_expand_border_mbpair( x264_t *h, int mb_x, int mb_y )
 {
     for( int i = 0; i < h->fenc->i_plane; i++ )
     {
-        int shift = i && !CHROMA444;
+        int v_shift = i && h->mb.chroma_v_shift;
         int stride = h->fenc->i_stride[i];
-        int height = h->param.i_height >> shift;
-        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+        int height = h->param.i_height >> v_shift;
+        int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
         pixel *fenc = h->fenc->plane[i] + 16*mb_x;
         for( int y = height; y < height + pady; y++ )
             memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
diff --git a/common/frame.h b/common/frame.h
index 77af60d1..a13e05b4 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -181,12 +181,20 @@ typedef struct
 {
     x264_deblock_inter_t deblock_luma[2];
     x264_deblock_inter_t deblock_chroma[2];
+    x264_deblock_inter_t deblock_h_chroma_420;
+    x264_deblock_inter_t deblock_h_chroma_422;
     x264_deblock_intra_t deblock_luma_intra[2];
     x264_deblock_intra_t deblock_chroma_intra[2];
+    x264_deblock_intra_t deblock_h_chroma_420_intra;
+    x264_deblock_intra_t deblock_h_chroma_422_intra;
     x264_deblock_inter_t deblock_luma_mbaff;
     x264_deblock_inter_t deblock_chroma_mbaff;
+    x264_deblock_inter_t deblock_chroma_420_mbaff;
+    x264_deblock_inter_t deblock_chroma_422_mbaff;
     x264_deblock_intra_t deblock_luma_intra_mbaff;
     x264_deblock_intra_t
deblock_chroma_intra_mbaff; + x264_deblock_intra_t deblock_chroma_420_intra_mbaff; + x264_deblock_intra_t deblock_chroma_422_intra_mbaff; void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); diff --git a/common/macroblock.c b/common/macroblock.c index 7c524ff0..f985e772 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -6,6 +6,7 @@ * Authors: Fiona Glaser * Laurent Aimar * Loren Merritt + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -50,23 +51,27 @@ static NOINLINE void x264_mb_mc_0xywh( x264_t *h, int x, int y, int width, int h } else { - // chroma is offset if MCing from a field of opposite parity - if( MB_INTERLACED & i_ref ) + int v_shift = h->mb.chroma_v_shift; + // Chroma in 4:2:0 is offset if MCing from a field of opposite parity + if( v_shift & MB_INTERLACED & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + height = 4*height >> v_shift; + + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1], - mvx, mvy, 2*width, 2*height ); + mvx, 2*mvy>>v_shift, 2*width, height ); if( h->sh.weight[i_ref][1].weightfn ) - h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->sh.weight[i_ref][1], height*2 ); + h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, + &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, + &h->sh.weight[i_ref][1], height ); if( h->sh.weight[i_ref][2].weightfn ) - h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, - &h->sh.weight[i_ref][2],height*2 ); + h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, + &h->sh.weight[i_ref][2], height ); } } static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height ) @@ -85,13 +90,15 @@ static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int h } else { - if( MB_INTERLACED & i_ref ) + int v_shift = h->mb.chroma_v_shift; + if( v_shift & MB_INTERLACED & i_ref ) mvy += (h->mb.i_mb_y & 1)*4 - 2; - h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], - &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset], + &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1], - mvx, mvy, 2*width, 2*height ); + mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift ); } } @@ -128,17 +135,21 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int } else { - if( MB_INTERLACED & i_ref0 ) + int v_shift = h->mb.chroma_v_shift; + if( v_shift & MB_INTERLACED & i_ref0 ) mvy0 += (h->mb.i_mb_y & 1)*4 - 2; - if( MB_INTERLACED & i_ref1 ) + if( v_shift & MB_INTERLACED & i_ref1 ) mvy1 += (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1], - mvx0, mvy0, 2*width, 2*height ); 
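The motion-vector handling is the subtle part of 4:2:2 chroma MC: mc_chroma expects vectors in eighth chroma-pel units, while mvy is in quarter luma-pel units. With half vertical chroma resolution (4:2:0) the numeric value carries over unchanged; with full vertical resolution (4:2:2) it must be doubled, hence 2*mvy>>v_shift in these calls. Note also that only 4:2:0 keeps the field-parity offset, since only there does a chroma field sit at a fractional-line offset from the opposite-parity reference field. For illustration:

    int chroma_mvy = 2*mvy >> v_shift; /* == mvy for 4:2:0, 2*mvy for 4:2:2 */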
+ mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift ); h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1], - mvx1, mvy1, 2*width, 2*height ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); - h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); + mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift ); + + int chromapix = h->luma2chroma_pixel[i_mode]; + int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x; + h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight ); + h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight ); } } @@ -301,7 +312,9 @@ int x264_macroblock_cache_allocate( x264_t *h ) } else { - luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv); + /* Both ref and fenc is stored for 4:2:0 and 4:2:2 which means that 4:2:0 and 4:4:4 + * needs the same amount of space and 4:2:2 needs twice that much */ + luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv); if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART ) //smart can weight one ref and one offset -1 in 8-bit @@ -491,6 +504,24 @@ void x264_macroblock_thread_init( x264_t *h ) (h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I); h->mb.i_mb_prev_xy = -1; + /* 4:2:0 4:2:2 4:4:4 + * fdec fenc fdec fenc fdec fenc + * y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y y y y y y y y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y y Y Y Y Y Y Y Y Y + * y Y Y Y Y U U V V y Y Y Y Y U U V V y Y Y Y Y U U U U + * u u u v v v U U V V u u u v v v U U V V u u u u u u u U U U U + * u U U v V V u U U v V V U U V V u U U U U U U U U + * u U U v V V u U U v V V U U V V u U U U U U U U U + * u U U v V V u U U U U V V V V + * u U U v V V u U U U U V V V V + * v v v v v v v V V V V + * v V V V V V V V V + * v V V V V + * v V V V V + * v V V V V + */ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; @@ -500,16 +531,6 @@ void x264_macroblock_thread_init( x264_t *h ) h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE; } - /* fdec: fenc: - * yyyyyyy - * yYYYY YYYY - * yYYYY YYYY - * yYYYY YYYY - * yYYYY YYYY - * uuu vvv UUVV - * uUU vVV UUVV - * uUU vVV - */ else { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; @@ -522,7 +543,7 @@ void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y ) int stride_y = fenc->i_stride[0]; int stride_uv = fenc->i_stride[1]; int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y; - int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv; + int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> h->mb.chroma_v_shift); h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y, fenc->plane[1]+off_uv, stride_uv, i_mb_x ); } @@ -537,12 +558,12 @@ NOINLINE void x264_copy_column8( pixel *dst, pixel *src ) static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { int mb_interlaced = b_mbaff && MB_INTERLACED; - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 
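The fenc/fdec layout diagram above follows from two per-format shifts; an illustrative helper (not from the patch) that computes the chroma block size of one macroblock:

    /* chroma_format_idc: 1 = 4:2:0, 2 = 4:2:2, 3 = 4:4:4 */
    static void mb_chroma_size( int chroma_format_idc, int *w, int *h )
    {
        int h_shift = chroma_format_idc <= 2; /* 4:2:0 and 4:2:2 halve the width */
        int v_shift = chroma_format_idc == 1; /* only 4:2:0 halves the height */
        *w = 16 >> h_shift; /* 8, 8, 16 */
        *h = 16 >> v_shift; /* 8, 16, 16 */
    }

This is the pair stored in h->mb.chroma_h_shift/chroma_v_shift and consumed by CHROMA_SIZE().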
16 >> h->mb.chroma_v_shift : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << mb_interlaced; int i_pix_offset = mb_interlaced - ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride - : 16 * mb_x + w * mb_y * i_stride; + ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + height * mb_y * i_stride; pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset]; int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0; pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16]; @@ -554,7 +575,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset]; if( b_chroma ) { - h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 ); + h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height ); memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) ); memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) ); if( b_mbaff ) @@ -572,7 +593,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x } if( b_mbaff ) { - for( int j = 0; j < w; j++ ) + for( int j = 0; j < height; j++ ) if( b_chroma ) { h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2]; @@ -854,8 +875,8 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m /* load non_zero_count */ CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] ); - CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] ); - CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] ); + CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>h->mb.chroma_v_shift)] ); + CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>h->mb.chroma_v_shift)] ); /* Finish the prefetching */ for( int l = 0; l < lists; l++ ) @@ -906,16 +927,17 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]]; h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]]; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { - h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16]; - h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16]; - h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32]; - h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32]; + int offset = (4>>h->mb.chroma_h_shift) - 4; + h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = 
nnz[lbot][left_index_table->nnz[3]+16+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset]; + h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset]; } else { @@ -943,7 +965,7 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = @@ -983,6 +1005,11 @@ static void ALWAYS_INLINE x264_macroblock_cache_load( x264_t *h, int mb_x, int m { x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE ); x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE ); + x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE ); + } x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 ); } } @@ -1424,15 +1451,17 @@ void x264_macroblock_deblock_strength( x264_t *h ) } /* Early termination: in this case, nnz guarantees all edges use strength 2.*/ - if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 ) + if( h->mb.b_transform_8x8 && !CHROMA444 ) { - M32( bs[0][0] ) = 0x02020202; - M32( bs[0][2] ) = 0x02020202; - M32( bs[0][4] ) = 0x02020202; - M32( bs[1][0] ) = 0x02020202; - M32( bs[1][2] ) = 0x02020202; - M32( bs[1][4] ) = 0x02020202; - return; + int cbp_mask = 0xf >> h->mb.chroma_v_shift; + if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask ) + { + M32( bs[0][0] ) = 0x02020202; + M32( bs[0][2] ) = 0x02020202; + M32( bs[0][4] ) = 0x02020202; + memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] has to be set for 4:2:2 */ + return; + } } int neighbour_changed = 0; @@ -1595,14 +1624,14 @@ void x264_macroblock_deblock_strength( x264_t *h ) static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff ) { - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16; int i_stride = h->fdec->i_stride[i]; int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED); int i_pix_offset = (b_mbaff && MB_INTERLACED) - ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride - : 16 * mb_x + w * mb_y * i_stride; + ? 
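In the early-termination path above, 4:2:2 must keep strength 2 on bs[1][1] and bs[1][3] as well, because its 8x16 chroma blocks have horizontal 4x4-transform edges at chroma rows 4 and 12 even when luma uses the 8x8 transform. The memset covers bs[1][0..4] in one call and is equivalent to, for illustration:

    for( int e = 0; e < 5; e++ )
        M32( bs[1][e] ) = 0x02020202; /* horizontal edges 0..4, strength 2 each */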
16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride + : 16 * mb_x + height * mb_y * i_stride; if( b_chroma ) - h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] ); + h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height ); else h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 ); } @@ -1622,8 +1651,9 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int } else { - memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) ); - memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) ); + int backup_src = (15>>h->mb.chroma_v_shift) * FDEC_STRIDE; + memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) ); + memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) ); } if( b_mbaff ) { @@ -1639,7 +1669,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int } else { - backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE; + if( CHROMA_FORMAT == CHROMA_420 ) + backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE; memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) ); memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) ); } @@ -1650,8 +1681,8 @@ static void ALWAYS_INLINE x264_macroblock_backup_intra( x264_t *h, int mb_x, int /* In progressive we update intra_border_backup in-place, so the topleft neighbor will * no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. 
*/ h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15]; - h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444]; - h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444]; + h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)]; + h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)]; } } @@ -1744,7 +1775,7 @@ void x264_macroblock_cache_save( x264_t *h ) CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] ); CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] ); CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] ); - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ); CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] ); @@ -1809,7 +1840,7 @@ void x264_macroblock_cache_save( x264_t *h ) uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy]; uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy]; if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM ) - h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ]; + h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; else h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC; diff --git a/common/macroblock.h b/common/macroblock.h index 7f5d5661..12b90c62 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -397,15 +397,6 @@ static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b ) # define pack_pixel_2to4 pack16to32 #endif -#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef)) -#define array_non_zero_int array_non_zero_int -static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count ) -{ - for( int i = 0; i < i_count; i++ ) - if( v[i] ) - return 1; - return 0; -} static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx ) { const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1]; diff --git a/common/mc.c b/common/mc.c index 5352be14..c2b77f58 100644 --- a/common/mc.c +++ b/common/mc.c @@ -90,9 +90,11 @@ PIXEL_AVG_C( pixel_avg_16x8, 16, 8 ) PIXEL_AVG_C( pixel_avg_8x16, 8, 16 ) PIXEL_AVG_C( pixel_avg_8x8, 8, 8 ) PIXEL_AVG_C( pixel_avg_8x4, 8, 4 ) +PIXEL_AVG_C( pixel_avg_4x16, 4, 16 ) PIXEL_AVG_C( pixel_avg_4x8, 4, 8 ) PIXEL_AVG_C( pixel_avg_4x4, 4, 4 ) PIXEL_AVG_C( pixel_avg_4x2, 4, 2 ) +PIXEL_AVG_C( pixel_avg_2x8, 2, 8 ) PIXEL_AVG_C( pixel_avg_2x4, 2, 4 ) PIXEL_AVG_C( pixel_avg_2x2, 2, 2 ) @@ -330,9 +332,9 @@ void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta, } } -static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ) +static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ) { - for( int y=0; y<8; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE ) + for( int y=0; ymc_luma = mc_luma; pf->get_ref = get_ref; + pf->mc_chroma = mc_chroma; pf->avg[PIXEL_16x16]= pixel_avg_16x16; @@ -474,9 +477,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_8x16] = pixel_avg_8x16; pf->avg[PIXEL_8x8] = pixel_avg_8x8; pf->avg[PIXEL_8x4] = pixel_avg_8x4; + pf->avg[PIXEL_4x16] = pixel_avg_4x16; pf->avg[PIXEL_4x8] = pixel_avg_4x8; pf->avg[PIXEL_4x4] = pixel_avg_4x4; pf->avg[PIXEL_4x2] = pixel_avg_4x2; + pf->avg[PIXEL_2x8] = pixel_avg_2x8; pf->avg[PIXEL_2x4] = pixel_avg_2x4; pf->avg[PIXEL_2x2] = pixel_avg_2x2; @@ -490,9 +495,9 @@ void 
x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_8x8] = mc_copy_w8; pf->copy[PIXEL_4x4] = mc_copy_w4; - pf->store_interleave_8x8x2 = store_interleave_8x8x2; - pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc; - pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec; + pf->store_interleave_chroma = store_interleave_chroma; + pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc; + pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; diff --git a/common/mc.h b/common/mc.h index 15a0a254..09dda557 100644 --- a/common/mc.h +++ b/common/mc.h @@ -62,30 +62,27 @@ extern const x264_weight_t x264_weight_none[3]; typedef struct { - void (*mc_luma)(pixel *dst, int i_dst, pixel **src, int i_src, - int mvx, int mvy, - int i_width, int i_height, const x264_weight_t *weight ); + void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src, + int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* may round up the dimensions if they're not a power of 2 */ - pixel* (*get_ref)(pixel *dst, int *i_dst, pixel **src, int i_src, - int mvx, int mvy, - int i_width, int i_height, const x264_weight_t *weight ); + pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src, + int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ); /* mc_chroma may write up to 2 bytes of garbage to the right of dst, * so it must be run from left to right. */ - void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src, - int mvx, int mvy, - int i_width, int i_height ); + void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src, + int mvx, int mvy, int i_width, int i_height ); - void (*avg[10])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ); + void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ); /* only 16x16, 8x8, and 4x4 defined */ void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height ); void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height ); - void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); - void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src ); - void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src ); + void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); + void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height ); + void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height ); void (*plane_copy)( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h ); diff --git a/common/pixel.c b/common/pixel.c index 91dc1b87..b346681b 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -68,10 +68,10 @@ PIXEL_SAD_C( x264_pixel_sad_16x8, 16, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 ) PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 ) PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 ) +PIXEL_SAD_C( x264_pixel_sad_4x16, 4, 16 ) PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 ) PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 ) - /**************************************************************************** * pixel_ssd_WxH ****************************************************************************/ @@ -98,6 +98,7 @@ PIXEL_SSD_C( x264_pixel_ssd_16x8, 16, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 ) PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 ) PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 ) 
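The new block sizes exist because 4:2:2 halves only the chroma width: the chroma companion of a 16x16, 8x16, or 4x8 luma partition is 8x16, 4x16, or 2x8, which is what h->luma2chroma_pixel tabulates and why avg gains 4x16/2x8 kernels while sad/ssd/satd gain 4x16. An illustrative helper, not from the patch:

    static void chroma_partition_422( int lw, int lh, int *cw, int *ch )
    {
        *cw = lw >> 1; /* 16x16 -> 8x16, 8x16 -> 4x16, 4x8 -> 2x8 */
        *ch = lh;      /* height is preserved in 4:2:2 */
    }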
+PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 16 ) PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 ) PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 ) @@ -169,11 +170,11 @@ void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pi /**************************************************************************** * pixel_var_wxh ****************************************************************************/ -#define PIXEL_VAR_C( name, w ) \ +#define PIXEL_VAR_C( name, w, h ) \ static uint64_t name( pixel *pix, int i_stride ) \ { \ uint32_t sum = 0, sqr = 0; \ - for( int y = 0; y < w; y++ ) \ + for( int y = 0; y < h; y++ ) \ { \ for( int x = 0; x < w; x++ ) \ { \ @@ -185,32 +186,37 @@ static uint64_t name( pixel *pix, int i_stride ) \ return sum + ((uint64_t)sqr << 32); \ } -PIXEL_VAR_C( x264_pixel_var_16x16, 16 ) -PIXEL_VAR_C( x264_pixel_var_8x8, 8 ) +PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x16, 8, 16 ) +PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 ) /**************************************************************************** * pixel_var2_wxh ****************************************************************************/ -static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) -{ - uint32_t var = 0, sum = 0, sqr = 0; - for( int y = 0; y < 8; y++ ) - { - for( int x = 0; x < 8; x++ ) - { - int diff = pix1[x] - pix2[x]; - sum += diff; - sqr += diff * diff; - } - pix1 += i_stride1; - pix2 += i_stride2; - } - sum = abs(sum); - var = sqr - ((uint64_t)sum * sum >> 6); - *ssd = sqr; - return var; +#define PIXEL_VAR2_C( name, w, h ) \ +static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \ +{ \ + uint32_t var = 0, sum = 0, sqr = 0; \ + for( int y = 0; y < h; y++ ) \ + { \ + for( int x = 0; x < w; x++ ) \ + { \ + int diff = pix1[x] - pix2[x]; \ + sum += diff; \ + sqr += diff * diff; \ + } \ + pix1 += i_stride1; \ + pix2 += i_stride2; \ + } \ + sum = abs(sum); \ + var = sqr - ((uint64_t)sum * sum >> 6); \ + *ssd = sqr; \ + return var; \ } +PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16 ) +PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8 ) + #if BIT_DEPTH > 8 typedef uint32_t sum_t; typedef uint64_t sum2_t; @@ -309,9 +315,9 @@ PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 16, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 16, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 ) +PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 ) PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 ) - static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { sum2_t tmp[8][4]; @@ -535,6 +541,8 @@ INTRA_MBCMP( sad, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c ) INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c ) INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c,, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c ) INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c ) @@ -754,23 +762,27 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #define INIT7_NAME( name1, name2, cpu ) \ INIT6_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu; +#define INIT8_NAME( name1, name2, cpu ) \ + INIT7_NAME( name1, name2, cpu ) \ + pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu; #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu ) #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu ) #define INIT7( name, cpu ) 
INIT7_NAME( name, name, cpu ) +#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu ) #define INIT_ADS( cpu ) \ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu; - INIT7( sad, ); - INIT7_NAME( sad_aligned, sad, ); + INIT8( sad, ); + INIT8_NAME( sad_aligned, sad, ); INIT7( sad_x3, ); INIT7( sad_x4, ); - INIT7( ssd, ); - INIT7( satd, ); + INIT8( ssd, ); + INIT8( satd, ); INIT7( satd_x3, ); INIT7( satd_x4, ); INIT4( hadamard_ac, ); @@ -779,12 +791,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8; pixf->ssd_nv12_core = pixel_ssd_nv12_core; pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; - pixf->var2_8x8 = pixel_var2_8x8; pixf->vsad = pixel_vsad; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4; @@ -793,6 +807,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8; pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c; pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16; @@ -813,7 +829,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; - pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; @@ -837,8 +853,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2; pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; - pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; } if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) ) { @@ -937,7 +953,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; - pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; pixf->vsad = x264_pixel_vsad_mmx2; if( cpu&X264_CPU_CACHELINE_32 ) @@ -986,7 +1002,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif - pixf->var2_8x8 = x264_pixel_var2_8x8_sse2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2; pixf->vsad = x264_pixel_vsad_sse2; } @@ -1072,7 +1088,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #if ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3; #endif - pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3; if( cpu&X264_CPU_SHUFFLE_IS_FAST ) 
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) @@ -1154,7 +1170,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; - pixf->var2_8x8 = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; diff --git a/common/pixel.h b/common/pixel.h index c7ee0fbf..d2ea52f5 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -5,6 +5,7 @@ * * Authors: Loren Merritt * Fiona Glaser + Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -42,22 +43,19 @@ enum PIXEL_8x4 = 4, PIXEL_4x8 = 5, PIXEL_4x4 = 6, - PIXEL_4x2 = 7, - PIXEL_2x4 = 8, - PIXEL_2x2 = 9, + + /* Subsampled chroma only */ + PIXEL_4x16 = 7, /* 4:2:2 */ + PIXEL_4x2 = 8, + PIXEL_2x8 = 9, /* 4:2:2 */ + PIXEL_2x4 = 10, + PIXEL_2x2 = 11, }; -static const struct -{ - int w; - int h; -} x264_pixel_size[7] = +static const struct { uint8_t w, h; } x264_pixel_size[12] = { - { 16, 16 }, - { 16, 8 }, { 8, 16 }, - { 8, 8 }, - { 8, 4 }, { 4, 8 }, - { 4, 4 } + { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, + { 4, 16 }, { 4, 2 }, { 2, 8 }, { 2, 4 }, { 2, 2 }, }; static const uint8_t x264_size2pixel[5][5] = @@ -69,23 +67,32 @@ static const uint8_t x264_size2pixel[5][5] = { 0, 0, PIXEL_8x16, 0, PIXEL_16x16 } }; +static const uint8_t x264_luma2chroma_pixel[4][7] = +{ + { 0 }, + { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */ + { PIXEL_8x16, PIXEL_8x8, PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */ + { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */ +}; + typedef struct { - x264_pixel_cmp_t sad[7]; - x264_pixel_cmp_t ssd[7]; - x264_pixel_cmp_t satd[7]; + x264_pixel_cmp_t sad[8]; + x264_pixel_cmp_t ssd[8]; + x264_pixel_cmp_t satd[8]; x264_pixel_cmp_t ssim[7]; x264_pixel_cmp_t sa8d[4]; - x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */ - x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */ - x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */ + x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */ + x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */ + x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */ x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; - x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */ + x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ int (*vsad)( pixel *, int, int ); - int (*var2_8x8)( pixel *, int, pixel *, int, int * ); uint64_t (*var[4])( pixel *pix, int stride ); + int (*var2[4])( pixel *pix1, int stride1, + pixel *pix2, int stride2, int *ssd ); uint64_t (*hadamard_ac[4])( pixel *pix, int stride ); void (*ssd_nv12_core)( pixel *pixuv1, int stride1, @@ -110,12 +117,18 @@ typedef struct void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] ); - void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); - void 
(*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); - void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); + void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] ); void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] ); diff --git a/common/predict.c b/common/predict.c index 34798c2f..f5ed6426 100644 --- a/common/predict.c +++ b/common/predict.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -151,7 +152,7 @@ void x264_predict_16x16_p_c( pixel *src ) /**************************************************************************** - * 8x8 prediction for intra chroma block + * 8x8 prediction for intra chroma block (4:2:0) ****************************************************************************/ static void x264_predict_8x8c_dc_128_c( pixel *src ) @@ -297,6 +298,167 @@ void x264_predict_8x8c_p_c( pixel *src ) } } +/**************************************************************************** + * 8x16 prediction for intra chroma block (4:2:2) + ****************************************************************************/ + +static void x264_predict_8x16c_dc_128_c( pixel *src ) +{ + for( int y = 0; y < 16; y++ ) + { + MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); + MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ); + src += FDEC_STRIDE; + } +} +static void x264_predict_8x16c_dc_left_c( pixel *src ) +{ + for( int i = 0; i < 4; i++ ) + { + int dc = 0; + + for( int y = 0; y < 4; y++ ) + dc += src[y*FDEC_STRIDE - 1]; + + pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 ); + + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dcsplat; + MPIXEL_X4( src+4 ) = dcsplat; + src += FDEC_STRIDE; + } + } +} +static void x264_predict_8x16c_dc_top_c( pixel *src ) +{ + int dc0 = 0, dc1 = 0; + + for(int x = 0; x < 4; x++ ) + { + dc0 += src[x - FDEC_STRIDE]; + dc1 += src[x + 4 - FDEC_STRIDE]; + } + pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 ); + pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 ); + + for( int y = 0; y < 16; y++ ) + { + MPIXEL_X4( src+0 ) = dc0splat; + MPIXEL_X4( src+4 ) = dc1splat; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_dc_c( pixel *src ) +{ + int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0; + + /* + s0 s1 + s2 + s3 + s4 + s5 + */ + for( int i = 0; i < 4; i++ ) + { + s0 += src[i+0 - FDEC_STRIDE]; + s1 += src[i+4 - FDEC_STRIDE]; + s2 += src[-1 + (i+0) * FDEC_STRIDE]; + s3 += src[-1 + (i+4) * 
FDEC_STRIDE]; + s4 += src[-1 + (i+8) * FDEC_STRIDE]; + s5 += src[-1 + (i+12) * FDEC_STRIDE]; + } + /* + dc0 dc1 + dc2 dc3 + dc4 dc5 + dc6 dc7 + */ + pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 ); + pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 ); + pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 ); + pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 ); + pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 ); + pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 ); + pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 ); + pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 ); + + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc0; + MPIXEL_X4( src+4 ) = dc1; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc2; + MPIXEL_X4( src+4 ) = dc3; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc4; + MPIXEL_X4( src+4 ) = dc5; + src += FDEC_STRIDE; + } + for( int y = 0; y < 4; y++ ) + { + MPIXEL_X4( src+0 ) = dc6; + MPIXEL_X4( src+4 ) = dc7; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_h_c( pixel *src ) +{ + for( int i = 0; i < 16; i++ ) + { + pixel4 v = PIXEL_SPLAT_X4( src[-1] ); + MPIXEL_X4( src+0 ) = v; + MPIXEL_X4( src+4 ) = v; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_v_c( pixel *src ) +{ + pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE ); + pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE ); + + for( int i = 0; i < 16; i++ ) + { + MPIXEL_X4( src+0 ) = v0; + MPIXEL_X4( src+4 ) = v1; + src += FDEC_STRIDE; + } +} +void x264_predict_8x16c_p_c( pixel *src ) +{ + int H = 0; + int V = 0; + + for( int i = 0; i < 4; i++ ) + H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] ); + for( int i = 0; i < 8; i++ ) + V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] ); + + int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] ); + int b = ( 17 * H + 16 ) >> 5; + int c = ( 5 * V + 32 ) >> 6; + int i00 = a -3*b -7*c + 16; + + for( int y = 0; y < 16; y++ ) + { + int pix = i00; + for( int x = 0; x < 8; x++ ) + { + src[x] = x264_clip_pixel( pix>>5 ); + pix += b; + } + src += FDEC_STRIDE; + i00 += c; + } +} + /**************************************************************************** * 4x4 prediction for intra luma block ****************************************************************************/ @@ -762,6 +924,17 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) #endif } +void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] ) +{ + pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_c; + pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_c; + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_c; + pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_c; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c; + pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c; + pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c; +} + void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { pf[I_PRED_8x8_V] = x264_predict_8x8_v_c; diff --git a/common/predict.h b/common/predict.h index 23330f51..8ceb5773 100644 --- a/common/predict.h +++ b/common/predict.h @@ -42,7 +42,7 @@ enum intra_chroma_pred_e I_PRED_CHROMA_DC_TOP = 5, I_PRED_CHROMA_DC_128 = 6 }; -static const uint8_t x264_mb_pred_mode8x8c_fix[7] = +static const uint8_t x264_mb_chroma_pred_mode_fix[7] = { I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P, I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC @@ -123,9 +123,14 @@ void x264_predict_8x8c_dc_c ( pixel *src ); void 
x264_predict_8x8c_h_c ( pixel *src ); void x264_predict_8x8c_v_c ( pixel *src ); void x264_predict_8x8c_p_c ( pixel *src ); +void x264_predict_8x16c_dc_c( pixel *src ); +void x264_predict_8x16c_h_c ( pixel *src ); +void x264_predict_8x16c_v_c ( pixel *src ); +void x264_predict_8x16c_p_c ( pixel *src ); void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] ); void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); diff --git a/common/quant.c b/common/quant.c index 5be7f57f..db9d57a8 100644 --- a/common/quant.c +++ b/common/quant.c @@ -6,6 +6,7 @@ * Authors: Loren Merritt * Fiona Glaser * Christian Heine + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -141,54 +142,121 @@ static void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) } } -static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf ) +#define IDCT_DEQUANT_2X4_START \ + int a0 = dct[0] + dct[1]; \ + int a1 = dct[2] + dct[3]; \ + int a2 = dct[4] + dct[5]; \ + int a3 = dct[6] + dct[7]; \ + int a4 = dct[0] - dct[1]; \ + int a5 = dct[2] - dct[3]; \ + int a6 = dct[4] - dct[5]; \ + int a7 = dct[6] - dct[7]; \ + int b0 = a0 + a1; \ + int b1 = a2 + a3; \ + int b2 = a4 + a5; \ + int b3 = a6 + a7; \ + int b4 = a0 - a1; \ + int b5 = a2 - a3; \ + int b6 = a4 - a5; \ + int b7 = a6 - a7; + +static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ) +{ + IDCT_DEQUANT_2X4_START + int dmf = dequant_mf[i_qp%6][0] << i_qp/6; + dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6; + dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6; + dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6; + dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6; + dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6; + dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6; + dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6; + dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6; +} + +static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp ) +{ + IDCT_DEQUANT_2X4_START + int dmf = dequant_mf[i_qp%6][0] << i_qp/6; + dct[0] = ((b0 + b1) * dmf + 32) >> 6; + dct[1] = ((b2 + b3) * dmf + 32) >> 6; + dct[2] = ((b0 - b1) * dmf + 32) >> 6; + dct[3] = ((b2 - b3) * dmf + 32) >> 6; + dct[4] = ((b4 - b5) * dmf + 32) >> 6; + dct[5] = ((b6 - b7) * dmf + 32) >> 6; + dct[6] = ((b4 + b5) * dmf + 32) >> 6; + dct[7] = ((b6 + b7) * dmf + 32) >> 6; +} + +static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf ) +{ + IDCT_DEQUANT_2X4_START + out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */ + out[1] = ((b2 + b3) * dmf + 2080) >> 6; + out[2] = ((b0 - b1) * dmf + 2080) >> 6; + out[3] = ((b2 - b3) * dmf + 2080) >> 6; + out[4] = ((b4 - b5) * dmf + 2080) >> 6; + out[5] = ((b6 - b7) * dmf + 2080) >> 6; + out[6] = ((b4 + b5) * dmf + 2080) >> 6; + out[7] = ((b6 + b7) * dmf + 2080) >> 6; +} +#undef IDCT_DEQUANT_2X4_START + +static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf ) { int d0 = dct[0] + dct[1]; int d1 = dct[2] + dct[3]; int d2 = dct[0] - dct[1]; int d3 = dct[2] - dct[3]; - out[0] = (d0 + d1) * dequant_mf >> 5; - out[1] = (d0 - d1) * dequant_mf >> 5; - out[2] = (d2 + 
d3) * dequant_mf >> 5; - out[3] = (d2 - d3) * dequant_mf >> 5; + out[0] = ((d0 + d1) * dmf >> 5) + 32; + out[1] = ((d0 - d1) * dmf >> 5) + 32; + out[2] = ((d2 + d3) * dmf >> 5) + 32; + out[3] = ((d2 - d3) * dmf >> 5) + 32; } -static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf ) +static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 ) { - dctcoef out[4]; - idct_dequant_2x2_dconly( out, dct, dequant_mf ); - return ((ref[0] ^ (out[0]+32)) - | (ref[1] ^ (out[1]+32)) - | (ref[2] ^ (out[2]+32)) - | (ref[3] ^ (out[3]+32))) >> 6; + dctcoef out[8]; + + if( chroma422 ) + optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf ); + else + optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf ); + + int sum = 0; + for( int i = 0; i < (chroma422?8:4); i++ ) + sum |= ref[i] ^ out[i]; + return sum >> 6; } -static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) +static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 ) { /* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */ - dctcoef dct_orig[4]; + dctcoef dct_orig[8]; int coeff, nz; - idct_dequant_2x2_dconly( dct_orig, dct, dequant_mf ); - dct_orig[0] += 32; - dct_orig[1] += 32; - dct_orig[2] += 32; - dct_orig[3] += 32; + if( chroma422 ) + optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf ); + else + optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf ); /* If the DC coefficients already round to zero, terminate early. */ - if( !((dct_orig[0]|dct_orig[1]|dct_orig[2]|dct_orig[3])>>6) ) + int sum = 0; + for( int i = 0; i < (chroma422?8:4); i++ ) + sum |= dct_orig[i]; + if( !(sum >> 6) ) return 0; /* Start with the highest frequency coefficient... is this the best option? */ - for( nz = 0, coeff = 3; coeff >= 0; coeff-- ) + for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- ) { int level = dct[coeff]; - int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */ + int sign = level>>31 | 1; /* dct[coeff] < 0 ? 
-1 : 1 */ while( level ) { dct[coeff] = level - sign; - if( idct_dequant_round_2x2_dc( dct_orig, dct, dequant_mf ) ) + if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) ) { nz = 1; dct[coeff] = level; @@ -201,6 +269,16 @@ static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) return nz; } +static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) +{ + return optimize_chroma_dc_internal( dct, dequant_mf, 0 ); +} + +static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf ) +{ + return optimize_chroma_dc_internal( dct, dequant_mf, 1 ); +} + static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ) { for( int i = 0; i < size; i++ ) @@ -275,30 +353,20 @@ static int x264_decimate_score64( dctcoef *dct ) return x264_decimate_score_internal( dct, 64 ); } -static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count ) -{ - int i_last = i_count-1; - while( i_last >= 0 && l[i_last] == 0 ) - i_last--; - return i_last; +#define last(num)\ +static int x264_coeff_last##num( dctcoef *l )\ +{\ + int i_last = num-1;\ + while( i_last >= 0 && l[i_last] == 0 )\ + i_last--;\ + return i_last;\ } -static int x264_coeff_last4( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 4 ); -} -static int x264_coeff_last15( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 15 ); -} -static int x264_coeff_last16( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 16 ); -} -static int x264_coeff_last64( dctcoef *l ) -{ - return x264_coeff_last_internal( l, 64 ); -} +last(4) +last(8) +last(15) +last(16) +last(64) #define level_run(num)\ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\ @@ -317,10 +385,10 @@ static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel ) } level_run(4) +level_run(8) level_run(15) level_run(16) - void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -332,18 +400,24 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4_dc = dequant_4x4_dc; pf->dequant_8x8 = dequant_8x8; - pf->optimize_chroma_dc = optimize_chroma_dc; + pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc; + pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly; + + pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc; + pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc; pf->denoise_dct = x264_denoise_dct; pf->decimate_score15 = x264_decimate_score15; pf->decimate_score16 = x264_decimate_score16; pf->decimate_score64 = x264_decimate_score64; - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4; + pf->coeff_last4 = x264_coeff_last4; + pf->coeff_last8 = x264_coeff_last8; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4; + pf->coeff_level_run4 = x264_coeff_level_run4; + pf->coeff_level_run8 = x264_coeff_level_run8; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16; @@ -361,16 +435,16 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz; } pf->decimate_score64 = x264_decimate_score64_mmx2; - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2; + pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; 
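/* Illustration only, not part of the patch: the dedicated coeff_last4 and
 * coeff_last8 pointers replace the old coeff_last[DCT_CHROMA_DC] entry because
 * the chroma DC block has 4 coefficients in 4:2:0 but 8 in 4:2:2 (the 2x4 DC
 * transform above). A hypothetical call site, assuming the usual h->quantf
 * function table, would select on the chroma format:
 *
 *     int last = CHROMA_FORMAT == CHROMA_422 ? h->quantf.coeff_last8( dct_dc )
 *                                            : h->quantf.coeff_last4( dct_dc );
 */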
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; if( cpu&X264_CPU_LZCNT ) - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } if( cpu&X264_CPU_SSE2 ) { @@ -397,7 +471,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt; + pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; @@ -471,12 +545,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2; pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2; #endif - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2; + pf->coeff_last4 = x264_coeff_last4_mmx2; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt; + pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } } @@ -493,7 +567,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2; pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } - pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2; pf->denoise_dct = x264_denoise_dct_sse2; pf->decimate_score15 = x264_decimate_score15_sse2; pf->decimate_score16 = x264_decimate_score16_sse2; @@ -524,7 +598,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; - pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3; pf->denoise_dct = x264_denoise_dct_ssse3; pf->decimate_score15 = x264_decimate_score15_ssse3; pf->decimate_score16 = x264_decimate_score16_ssse3; @@ -541,7 +615,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; pf->quant_4x4 = x264_quant_4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; - pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4; } if( cpu&X264_CPU_AVX ) @@ -552,7 +626,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_avx; pf->dequant_8x8 = x264_dequant_8x8_avx; } - pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx; + pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx; pf->denoise_dct = x264_denoise_dct_avx; } #endif // HAVE_MMX @@ -571,7 +645,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) - pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm; + pf->coeff_last4 = 
x264_coeff_last4_arm; if( cpu&X264_CPU_NEON ) { diff --git a/common/quant.h b/common/quant.h index 09364143..9ad5385a 100644 --- a/common/quant.h +++ b/common/quant.h @@ -38,7 +38,11 @@ typedef struct void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); - int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf ); + void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); + void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); + + int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf ); + int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf ); void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); @@ -46,7 +50,11 @@ typedef struct int (*decimate_score16)( dctcoef *dct ); int (*decimate_score64)( dctcoef *dct ); int (*coeff_last[14])( dctcoef *dct ); + int (*coeff_last4)( dctcoef *dct ); + int (*coeff_last8)( dctcoef *dct ); int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel ); + int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel ); + int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel ); } x264_quant_function_t; void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); diff --git a/common/set.h b/common/set.h index 4bbfea6e..038dbd4c 100644 --- a/common/set.h +++ b/common/set.h @@ -35,10 +35,17 @@ enum profile_e PROFILE_HIGH = 100, PROFILE_HIGH10 = 110, PROFILE_HIGH422 = 122, - PROFILE_HIGH444 = 144, PROFILE_HIGH444_PREDICTIVE = 244, }; +enum chroma_format_e +{ + CHROMA_400 = 0, + CHROMA_420 = 1, + CHROMA_422 = 2, + CHROMA_444 = 3, +}; + enum cqm4_e { CQM_4IY = 0, diff --git a/common/vlc.c b/common/vlc.c index 1d002bbc..bd2fc52c 100644 --- a/common/vlc.c +++ b/common/vlc.c @@ -5,6 +5,7 @@ * * Authors: Laurent Aimar * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,16 +27,19 @@ #include "common.h" -const vlc_t x264_coeff0_token[5] = +/* [nC] */ +const vlc_t x264_coeff0_token[6] = { { 0x1, 1 }, /* str=1 */ { 0x3, 2 }, /* str=11 */ { 0xf, 4 }, /* str=1111 */ { 0x3, 6 }, /* str=000011 */ { 0x1, 2 }, /* str=01 */ + { 0x1, 1 }, /* str=1 */ }; -const vlc_t x264_coeff_token[5][16][4] = +/* [nC][i_total_coeff-1][i_trailing] */ +const vlc_t x264_coeff_token[6][16][4] = { { /* table 0 */ { /* i_total 1 */ @@ -440,6 +444,53 @@ const vlc_t x264_coeff_token[5][16][4] = { 0x0, 7 }, /* str=0000000 */ }, }, + { /* table 5 */ + { /* i_total 1 */ + { 0xf, 7 }, /* str=0001111 */ + { 0x1, 2 }, /* str=01 */ + }, + { /* i_total 2 */ + { 0xe, 7 }, /* str=0001110 */ + { 0xd, 7 }, /* str=0001101 */ + { 0x1, 3 }, /* str=001 */ + }, + { /* i_total 3 */ + { 0x7, 9 }, /* str=000000111 */ + { 0xc, 7 }, /* str=0001100 */ + { 0xb, 7 }, /* str=0001011 */ + { 0x1, 5 }, /* str=00001 */ + }, + { /* i_total 4 */ + { 0x6, 9 }, /* str=000000110 */ + { 0x5, 9 }, /* str=000000101 */ + { 0xa, 7 }, /* str=0001010 */ + { 0x1, 6 }, /* str=000001 */ + }, + { /* i_total 5 */ + { 0x7, 10 }, /* str=0000000111 */ + { 0x6, 10 }, /* str=0000000110 */ + { 0x4, 9 }, /* str=000000100 */ + { 0x9, 7 }, /* str=0001001 */ + }, + { /* i_total 6 */ + { 0x7, 11 }, /* str=00000000111 */ + { 0x6, 11 }, /* str=00000000110 */ + { 0x5, 10 }, /* str=0000000101 */ + { 0x8, 7 }, /* str=0001000 */ + }, + { /* i_total 7 */ + { 0x7, 12 }, /* str=000000000111 
*/ + { 0x6, 12 }, /* str=000000000110 */ + { 0x5, 11 }, /* str=00000000101 */ + { 0x4, 10 }, /* str=0000000100 */ + }, + { /* i_total 8 */ + { 0x7, 13 }, /* str=0000000000111 */ + { 0x5, 12 }, /* str=000000000101 */ + { 0x4, 12 }, /* str=000000000100 */ + { 0x4, 11 }, /* str=00000000100 */ + }, + }, }; /* [i_total_coeff-1][i_total_zeros] */ @@ -613,7 +664,7 @@ const vlc_t x264_total_zeros[15][16] = }; /* [i_total_coeff-1][i_total_zeros] */ -const vlc_t x264_total_zeros_dc[3][4] = +const vlc_t x264_total_zeros_2x2_dc[3][4] = { { /* i_total 1 */ { 0x1, 1 }, /* str=1 */ @@ -632,7 +683,61 @@ const vlc_t x264_total_zeros_dc[3][4] = }, }; -/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */ +/* [i_total_coeff-1][i_total_zeros] */ +const vlc_t x264_total_zeros_2x4_dc[7][8] = +{ + { /* i_total 1 */ + { 0x1, 1 }, /* str=1 */ + { 0x2, 3 }, /* str=010 */ + { 0x3, 3 }, /* str=011 */ + { 0x2, 4 }, /* str=0010 */ + { 0x3, 4 }, /* str=0011 */ + { 0x1, 4 }, /* str=0001 */ + { 0x1, 5 }, /* str=00001 */ + { 0x0, 5 }, /* str=00000 */ + }, + { /* i_total 2 */ + { 0x0, 3 }, /* str=000 */ + { 0x1, 2 }, /* str=01 */ + { 0x1, 3 }, /* str=001 */ + { 0x4, 3 }, /* str=100 */ + { 0x5, 3 }, /* str=101 */ + { 0x6, 3 }, /* str=110 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 3 */ + { 0x0, 3 }, /* str=000 */ + { 0x1, 3 }, /* str=001 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x6, 3 }, /* str=110 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 4 */ + { 0x6, 3 }, /* str=110 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x7, 3 }, /* str=111 */ + }, + { /* i_total 5 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x2, 2 }, /* str=10 */ + { 0x3, 2 }, /* str=11 */ + }, + { /* i_total 6 */ + { 0x0, 2 }, /* str=00 */ + { 0x1, 2 }, /* str=01 */ + { 0x1, 1 }, /* str=1 */ + }, + { /* i_total 7 */ + { 0x0, 1 }, /* str=0 */ + { 0x1, 1 }, /* str=1 */ + } +}; + +/* [MIN( i_zero_left-1, 6 )][run_before] */ const vlc_t x264_run_before[7][16] = { { /* i_zero_left 1 */ @@ -674,7 +779,7 @@ const vlc_t x264_run_before[7][16] = { 0x5, 3 }, /* str=101 */ { 0x4, 3 }, /* str=100 */ }, - { /* i_zero_left 7 */ + { /* i_zero_left >6 */ { 0x7, 3 }, /* str=111 */ { 0x6, 3 }, /* str=110 */ { 0x5, 3 }, /* str=101 */ diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 24a5c3fa..f5c0d797 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1015,10 +1015,9 @@ cglobal plane_copy_interleave_core, 7,7 RET ;----------------------------------------------------------------------------- -; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv ) +; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- -cglobal store_interleave_8x8x2, 4,5 - mov r4d, 4 +cglobal store_interleave_chroma, 5,5 FIX_STRIDES r1d .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a @@ -1026,7 +1025,7 @@ cglobal store_interleave_8x8x2, 4,5 add r2, FDEC_STRIDEB*2 add r3, FDEC_STRIDEB*2 lea r0, [r0+r1*2] - dec r4d + sub r4d, 2 jg .loop REP_RET %endmacro ; PLANE_INTERLEAVE @@ -1076,34 +1075,32 @@ cglobal plane_copy_deinterleave, 6,7 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src ) +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height ) 
;----------------------------------------------------------------------------- -cglobal load_deinterleave_8x8x2_fenc, 3,4 +cglobal load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START - mov r3d, 4 FIX_STRIDES r2d .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a add r0, FENC_STRIDEB*2 lea r1, [r1+r2*2] - dec r3d + sub r3d, 2 jg .loop REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src ) +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height ) ;----------------------------------------------------------------------------- -cglobal load_deinterleave_8x8x2_fdec, 3,4 +cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START - mov r3d, 4 FIX_STRIDES r2d .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a add r0, FDEC_STRIDEB*2 lea r1, [r1+r2*2] - dec r3d + sub r3d, 2 jg .loop REP_RET %endmacro ; PLANE_DEINTERLEAVE diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 52e62d6e..6a730475 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -113,17 +113,17 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu, uint16_t *dstv, int i_dstv, uint16_t *src, int i_src, int w, int h ); -void x264_store_interleave_8x8x2_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv ); -void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src ); -void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src ); +void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); +void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); void 
*x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); void x264_memzero_aligned_mmx( void * dst, int n ); @@ -497,8 +497,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_MMX) ) return; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx; @@ -519,7 +519,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2; + pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2; @@ -552,8 +552,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; @@ -570,7 +570,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->integral_init4v = x264_integral_init4v_sse2; pf->integral_init8v = x264_integral_init8v_sse2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; pf->offsetadd = x264_mc_offsetadd_wtab_sse2; pf->offsetsub = x264_mc_offsetsub_wtab_sse2; @@ -603,11 +603,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( !(cpu&X264_CPU_AVX) ) return; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_avx; + pf->store_interleave_chroma = x264_store_interleave_chroma_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) pf->mc_chroma = x264_mc_chroma_avx; @@ -663,9 +663,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_SSE2_IS_FAST ) { - pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium? - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2; + pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium? 
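/* Illustration only, not part of the patch: with the explicit height argument,
 * a single set of interleave/deinterleave helpers covers both NV12 (4:2:0) and
 * NV16 (4:2:2) layouts. A sketch of a caller, using the chroma_v_shift field
 * seen elsewhere in this patch (1 for 4:2:0, 0 for 4:2:2); the dst and i_dst
 * names are hypothetical:
 *
 *     int height = 16 >> h->mb.chroma_v_shift;   // 8 rows for 4:2:0, 16 for 4:2:2
 *     h->mc.store_interleave_chroma( dst, i_dst, h->mb.pic.p_fdec[1],
 *                                    h->mb.pic.p_fdec[2], height );
 */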
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; pf->mc_luma = mc_luma_sse2; @@ -695,8 +695,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; - pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3; - pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; pf->hpel_filter = x264_hpel_filter_ssse3; diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 215a7170..40f9ed58 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -670,10 +670,10 @@ DEQUANT_DC w, pmullw %endif ;----------------------------------------------------------------------------- -; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf ) +; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf ) ;----------------------------------------------------------------------------- -%macro OPTIMIZE_CHROMA_DC 0 +%macro OPTIMIZE_CHROMA_2x2_DC 0 %assign %%regs 5 %if cpuflag(sse4) %assign %%regs %%regs-1 @@ -681,7 +681,7 @@ DEQUANT_DC w, pmullw %ifndef ARCH_X86_64 %assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64 %endif -cglobal optimize_chroma_dc, 0,%%regs,7 +cglobal optimize_chroma_2x2_dc, 0,%%regs,7 movifnidn t0, r0mp movd m2, r1m movq m1, [t0] @@ -775,13 +775,13 @@ cglobal optimize_chroma_dc, 0,%%regs,7 %ifndef HIGH_BIT_DEPTH INIT_XMM sse2 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM ssse3 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM sse4 -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC INIT_XMM avx -OPTIMIZE_CHROMA_DC +OPTIMIZE_CHROMA_2x2_DC %endif ; !HIGH_BIT_DEPTH %ifdef HIGH_BIT_DEPTH diff --git a/common/x86/quant.h b/common/x86/quant.h index 4abaea09..8b604720 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -57,10 +57,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_ void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); -int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf ); -int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf ); +int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf ); void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); diff --git a/encoder/analyse.c b/encoder/analyse.c index 69de5174..b5b5a78d 100644 --- 
a/encoder/analyse.c +++ b/encoder/analyse.c @@ -103,8 +103,8 @@ typedef struct int i_satd_pcm; /* Chroma part */ - int i_satd_i8x8chroma; - int i_satd_i8x8chroma_dir[7]; + int i_satd_chroma; + int i_satd_chroma_dir[7]; int i_predict8x8chroma; /* II: Inter part P/B frame */ @@ -431,7 +431,7 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) a->i_satd_i16x16 = a->i_satd_i8x8 = a->i_satd_i4x4 = - a->i_satd_i8x8chroma = COST_MAX; + a->i_satd_chroma = COST_MAX; /* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */ a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX; @@ -607,7 +607,7 @@ static const int8_t i16x16_mode_available[5][5] = {I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1}, }; -static const int8_t i8x8chroma_mode_available[5][5] = +static const int8_t chroma_mode_available[5][5] = { {I_PRED_CHROMA_DC_128, -1, -1, -1, -1}, {I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1}, @@ -641,11 +641,11 @@ static ALWAYS_INLINE const int8_t *predict_16x16_mode_available( int i_neighbour return i16x16_mode_available[idx]; } -static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour ) +static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour ) { int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT); idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT); - return i8x8chroma_mode_available[idx]; + return chroma_mode_available[idx]; } static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i ) @@ -690,45 +690,46 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd ) static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) { - if( a->i_satd_i8x8chroma < COST_MAX ) + if( a->i_satd_chroma < COST_MAX ) return; if( CHROMA444 ) { if( !h->mb.b_chroma_me ) { - a->i_satd_i8x8chroma = 0; + a->i_satd_chroma = 0; return; } /* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. 
*/ h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] ); h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] ); - a->i_satd_i8x8chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); return; } - const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra ); + const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; - /* 8x8 prediction selection for chroma */ + /* Prediction selection for chroma */ if( predict_mode[3] >= 0 && !h->mb.b_lossless ) { int satdu[4], satdv[4]; - h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu ); - h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); - h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); - satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); - satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu ); + h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); + h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); + satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); + satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); for( ; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode ); - a->i_satd_i8x8chroma_dir[i_mode] = i_satd; - COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode ); + a->i_satd_chroma_dir[i_mode] = i_satd; + COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } else @@ -740,20 +741,20 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) /* we do the prediction */ if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* we calculate the cost */ - i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + - h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + - a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] ); + i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + + a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] ); - a->i_satd_i8x8chroma_dir[i_mode] = i_satd; - COPY2_IF_LT( a->i_satd_i8x8chroma, 
i_satd, a->i_predict8x8chroma, i_mode ); + a->i_satd_chroma_dir[i_mode] = i_satd; + COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode ); } } @@ -1110,17 +1111,17 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) /* RD selection for chroma prediction */ if( !CHROMA444 ) { - const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra ); + const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra ); if( predict_mode[1] >= 0 ) { int8_t predict_mode_sorted[4]; int i_max; - int i_thresh = a->b_early_terminate ? a->i_satd_i8x8chroma * 5/4 : COST_MAX; + int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX; for( i_max = 0; *predict_mode >= 0; predict_mode++ ) { int i_mode = *predict_mode; - if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) + if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma ) predict_mode_sorted[i_max++] = i_mode; } @@ -1131,21 +1132,21 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) /* the previous thing encoded was x264_intra_rd(), so the pixels and * coefs for the current chroma mode are still around, so we only * have to recount the bits. */ - i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); + i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 ); for( int i = 0; i < i_max; i++ ) { int i_mode = predict_mode_sorted[i]; if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } /* if we've already found a mode that needs no residual, then * probably any mode with a residual will be worse. * so avoid dct on the remaining modes to improve speed. 
*/ - i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); + i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 ); COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma ); } h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; @@ -1273,14 +1274,13 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) #define LOAD_FENC(m, src, xoff, yoff) \ { \ - int s = !CHROMA444; \ (m)->p_cost_mv = a->p_cost_mv; \ (m)->i_stride[0] = h->mb.pic.i_stride[0]; \ (m)->i_stride[1] = h->mb.pic.i_stride[1]; \ (m)->i_stride[2] = h->mb.pic.i_stride[2]; \ (m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \ - (m)->p_fenc[1] = &(src)[1][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \ - (m)->p_fenc[2] = &(src)[2][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \ + (m)->p_fenc[1] = &(src)[1][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \ + (m)->p_fenc[2] = &(src)[2][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \ } #define LOAD_HPELS(m, src, list, ref, xoff, yoff) \ @@ -1301,7 +1301,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a ) (m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \ } \ else \ - (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \ + (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>h->mb.chroma_v_shift)*(m)->i_stride[1]]; \ (m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \ (m)->weight = x264_weight_none; \ (m)->i_ref = ref; \ @@ -1672,19 +1672,22 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost; } -static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) +static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, + pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_16( pixel, pix1,[16*8] ); + ALIGNED_ARRAY_16( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; - const int i_stride = h->mb.pic.i_stride[1]; - const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride; - const int i_ref = a->l0.me8x8[i8x8].i_ref; - const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int i_stride = h->mb.pic.i_stride[1]; + int chroma_h_shift = chroma <= CHROMA_422; + int chroma_v_shift = chroma == CHROMA_420; + int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride; + int i_ref = a->l0.me8x8[i8x8].i_ref; + int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; x264_weight_t *weight = h->sh.weight[i_ref]; // FIXME weight can be done on 4x4 blocks even if mc is smaller #define CHROMA4x4MC( width, height, me, x, y ) \ - if( CHROMA444 ) \ + if( chroma == CHROMA_444 ) \ { \ int mvx = (me).mv[0] + 4*2*x; \ int mvy = (me).mv[1] + 4*2*y; \ @@ -1695,14 +1698,16 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, } \ else \ { \ - h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \ + int offset = x + (2>>chroma_v_shift)*16*y; \ + int chroma_height = (2>>chroma_v_shift)*height; \ + h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \ + (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \ if( weight[1].weightfn ) \ - weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \ + weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \ if( weight[2].weightfn ) \ - weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); \ + weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \ } - if( size == PIXEL_4x4 ) { x264_me_t *m = a->l0.me4x4[i8x8]; @@ -1723,13 +1728,24 @@ static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, CHROMA4x4MC( 2,4, m[0], 0,0 ); CHROMA4x4MC( 2,4, m[1], 2,0 ); } +#undef CHROMA4x4MC - int oe = (8*(i8x8&1) + 4*(i8x8&2)*FENC_STRIDE) >> !CHROMA444; - int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4; + int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE; + int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? 
PIXEL_4x8 : PIXEL_4x4; return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 ) + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 ); } +static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size ) +{ + if( CHROMA_FORMAT == CHROMA_444 ) + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 ); + else + return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 ); +} + static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 ) { pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref]; @@ -1845,47 +1861,46 @@ static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t * { ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] ); ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] ); - int l0_mvy_offset, l1_mvy_offset; int i_chroma_cost = 0; + int chromapix = h->luma2chroma_pixel[i_pixel]; #define COST_BI_CHROMA( m0, m1, width, height ) \ { \ if( CHROMA444 ) \ { \ h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \ - m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \ + m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \ - m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \ + m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \ - m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \ + m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \ - m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \ - h->mc.avg[i_pixel]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - h->mc.avg[i_pixel]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - i_chroma_cost = h->pixf.mbcmp[i_pixel]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \ - i_chroma_cost += h->pixf.mbcmp[i_pixel]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ + m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \ } \ else \ { \ - l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ - l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ - h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \ - h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \ - h->mc.avg[i_pixel+3]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - h->mc.avg[i_pixel+3]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ - i_chroma_cost = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \ - i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ + int v_shift = h->mb.chroma_v_shift; \ + int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \ + int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; \ + h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \ + m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ + h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \ + m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \ } \ + h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ + h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \ + i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \ + + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \ } if( i_pixel == PIXEL_16x16 ) - COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 ) + COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 ) else if( i_pixel == PIXEL_16x8 ) - COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 ) + COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 ) else if( i_pixel == PIXEL_8x16 ) - COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 ) + COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 ) else - COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 ) + COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 ) return i_chroma_cost; } @@ -1897,12 +1912,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) pixel *p_fenc = h->mb.pic.p_fenc[0]; pixel *p_fdec = h->mb.pic.p_fdec[0]; - int s = !CHROMA444; a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT]; if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 ) { - int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4; + int chromapix = h->luma2chroma_pixel[PIXEL_8x8]; + for( int i = 0; i < 4; i++ ) { const int x = (i&1)*8; @@ -1911,10 +1926,12 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) &p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE ); if( h->mb.b_chroma_me ) { - a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE, - &h->mb.pic.p_fdec[1][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE ) - + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE, - &h->mb.pic.p_fdec[2][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE ); + int fenc_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FENC_STRIDE; + int fdec_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FDEC_STRIDE; + a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE, + &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE ) + + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE, + &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE ); } a->i_cost16x16direct += a->i_cost8x8direct[i]; @@ -1924,10 +1941,10 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) } else { - int chromapix = CHROMA444 ? 
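
[Note: the `2*(mv+offset)>>v_shift` passed to mc_chroma above assumes mc_chroma keeps its 4:2:0-era convention for the vertical mv against a half-height plane; with that assumption the expression is a no-op for 4:2:0 (v_shift=1) and doubles the vertical mv for 4:2:2 (v_shift=0), whose chroma planes have full vertical resolution. A minimal model of just that adjustment:]

    #include <assert.h>

    static int chroma_mvy( int mvy, int mvy_offset, int v_shift )
    {
        return 2*(mvy + mvy_offset) >> v_shift;
    }

    int main(void)
    {
        assert( chroma_mvy( 6, 0, 1 ) == 6 );  /* 4:2:0: mv unchanged */
        assert( chroma_mvy( 6, 0, 0 ) == 12 ); /* 4:2:2: doubled for the full-height plane */
        return 0;
    }
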
PIXEL_16x16 : PIXEL_8x8; a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE ); if( h->mb.b_chroma_me ) { + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); } @@ -2055,7 +2072,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) if( h->mb.b_chroma_me ) { - ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] ); ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] ); if( CHROMA444 ) @@ -2071,31 +2087,37 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) } else { - if( MB_INTERLACED & a->l0.bi16x16.i_ref ) + ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + int v_shift = h->mb.chroma_v_shift; + + if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref ) { - int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 ); } else - h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], + h->mb.pic.i_stride[1], 16>>v_shift ); - if( MB_INTERLACED & a->l1.bi16x16.i_ref ) + if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref ) { - int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2; h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 ); } else - h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], + h->mb.pic.i_stride[1], 16>>v_shift ); - h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE, + h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); - h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE, + h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE, h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] ); - cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE ); + cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE ); } } @@ -3172,11 +3194,11 @@ intra_analysis: else { x264_mb_analyse_intra_chroma( h, &analysis ); - x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma ); + x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma ); } - analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma; + analysis.i_satd_i16x16 += analysis.i_satd_chroma; + analysis.i_satd_i8x8 += 
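
[Note: load_deinterleave_chroma_fenc replaces the fixed 8x8x2 variant and takes the row count explicitly (16>>v_shift). The sketch below is a guess at its shape, not the actual implementation: an interleaved NV12/NV16 reference row is split into U and V with V stored 8 pixels after U, which is what the `pixuv[i]` / `pixuv[i]+8` usage above implies.]

    #include <stdint.h>

    static void deinterleave_chroma( uint8_t *dst, intptr_t i_dst,
                                     const uint8_t *src, intptr_t i_src, int height )
    {
        for( int y = 0; y < height; y++ )
            for( int x = 0; x < 8; x++ )
            {
                dst[y*i_dst + x]     = src[y*i_src + 2*x];     /* U plane */
                dst[y*i_dst + x + 8] = src[y*i_src + 2*x + 1]; /* V plane */
            }
    }

    int main(void)
    {
        uint8_t src[16] = { 1,2, 3,4, 5,6, 7,8, 9,10, 11,12, 13,14, 15,16 };
        uint8_t dst[16];
        deinterleave_chroma( dst, 16, src, 16, 1 );
        return ( dst[0] == 1 && dst[8] == 2 ) ? 0 : 1;
    }
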
analysis.i_satd_chroma; + analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else x264_mb_analyse_intra( h, &analysis, i_cost ); @@ -3219,8 +3241,9 @@ intra_analysis: h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 ); if( !CHROMA444 ) { - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 ); + int height = 16 >> h->mb.chroma_v_shift; + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height ); + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height ); } x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) ); goto intra_analysis; @@ -3583,11 +3606,11 @@ intra_analysis: else { x264_mb_analyse_intra_chroma( h, &analysis ); - x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma ); + x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma ); } - analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma; - analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma; + analysis.i_satd_i16x16 += analysis.i_satd_chroma; + analysis.i_satd_i8x8 += analysis.i_satd_chroma; + analysis.i_satd_i4x4 += analysis.i_satd_chroma; } else x264_mb_analyse_intra( h, &analysis, i_satd_inter ); diff --git a/encoder/cabac.c b/encoder/cabac.c index 491b4ee7..c575724e 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -210,8 +210,8 @@ static void x264_cabac_mb_intra4x4_pred_mode( x264_cabac_t *cb, int i_pred, int static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb ) { - const int i_mode = x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]; - int ctx = 0; + int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]; + int ctx = 0; /* No need to test for I4x4 or I_16x16 as cache_save handle that */ if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 ) @@ -485,7 +485,7 @@ static inline void x264_cabac_mb8x8_mvd( x264_t *h, x264_cabac_t *cb, int i ) * 1-> AC 16x16 i_idx = luma4x4idx * 2-> Luma4x4 i_idx = luma4x4idx * 3-> DC Chroma i_idx = iCbCr - * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx + * 4-> AC Chroma i_idx = numChroma4x4Blks * iCbCr + chroma4x4idx * 5-> Luma8x8 i_idx = luma8x8idx */ @@ -567,6 +567,7 @@ static const uint8_t last_coeff_flag_offset_8x8[63] = 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8 }; +static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */ // node ctx: 0..3: abslevel1 (with abslevelgt1 == 0). // 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter). @@ -574,6 +575,9 @@ static const uint8_t last_coeff_flag_offset_8x8[63] = static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 }; /* map node ctx => cabac ctx for level>1 */ static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 }; +/* 4:2:2 chroma dc uses a slightly different state machine for some reason, also note that + * 4:2:0 chroma dc doesn't use the last state so it has identical output with both arrays. 
*/ +static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 }; static const uint8_t coeff_abs_level_transition[2][8] = { /* update node ctx after coding a level=1 */ { 1, 2, 3, 3, 4, 5, 6, 7 }, @@ -583,18 +587,17 @@ static const uint8_t coeff_abs_level_transition[2][8] = { static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63}; #if !RDO_SKIP_BS -static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +static ALWAYS_INLINE void block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc ) { - const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat]; - int coeff_idx = -1, node_ctx = 0, last; - int coeffs[64]; - - last = h->quantf.coeff_last[ctx_block_cat]( l ); + int coeff_idx = -1, node_ctx = 0; + int last = h->quantf.coeff_last[ctx_block_cat]( l ); + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; + dctcoef coeffs[64]; -#define WRITE_SIGMAP( l8x8 )\ +#define WRITE_SIGMAP( sig_off, last_off )\ {\ int i = 0;\ while( 1 )\ @@ -602,19 +605,18 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo if( l[i] )\ {\ coeffs[++coeff_idx] = l[i];\ - x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\ if( i == last )\ {\ - x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\ break;\ }\ else\ - x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\ + x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\ }\ else\ - x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? 
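
[Note: the 4:2:2 DC significance/last contexts pair coefficients up rather than giving each its own context, exactly as the MIN( i/2, 2 ) comment on coeff_flag_offset_chroma_422_dc says. A quick self-check of that table:]

    #include <assert.h>

    int main(void)
    {
        static const unsigned char coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 };
        for( int i = 0; i < 7; i++ )
            assert( coeff_flag_offset_chroma_422_dc[i] == ( i/2 < 2 ? i/2 : 2 ) );
        return 0;
    }
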
sig_offset[i] : i), 0 );\ - i++;\ - if( i == count_m1 )\ + x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\ + if( ++i == count_m1 )\ {\ coeffs[++coeff_idx] = l[i];\ break;\ @@ -622,11 +624,22 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo }\ } - int count_m1 = count_cat_m1[ctx_block_cat]; - if( count_m1 == 63 ) - WRITE_SIGMAP( 1 ) + if( chroma422dc ) + { + int count_m1 = 7; + WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] ) + } else - WRITE_SIGMAP( 0 ) + { + int count_m1 = count_cat_m1[ctx_block_cat]; + if( count_m1 == 63 ) + { + const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; + WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] ) + } + else + WRITE_SIGMAP( i, i ) + } do { @@ -639,7 +652,7 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo if( abs_coeff > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level; + ctx = levelgt1_ctx[node_ctx] + ctx_level; for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- ) x264_cabac_encode_decision( cb, ctx, 1 ); if( abs_coeff < 15 ) @@ -658,15 +671,23 @@ static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_blo x264_cabac_encode_bypass( cb, coeff_sign ); } while( --coeff_idx >= 0 ); } +static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 ); +} +static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + /* Template a version specifically for chroma 4:2:2 DC in order to avoid + * slowing down everything else due to the added complexity. */ + block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 1 ); +} #define block_residual_write_cabac_8x8( h, cb, cat, l ) block_residual_write_cabac( h, cb, cat, l ) - #else -/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct - * this is slightly incorrect because the sigmap is not reversible - * (contexts are repeated). However, there is nearly no quality penalty - * for this (~0.001db) and the speed boost (~30%) is worth it. */ -static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8 ) +/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is + * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there + * is nearly no quality penalty for this (~0.001db) and the speed boost (~30%) is worth it. */ +static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc ) { const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED]; int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat]; @@ -676,17 +697,20 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c int coeff_abs = abs(l[last]); int ctx = coeff_abs_level1_ctx[0] + ctx_level; int node_ctx; + const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; - if( last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) ) + if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) ) { - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? 
sig_offset[last] : last), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 ); } if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[0] + ctx_level; + ctx = levelgt1_ctx[0] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; @@ -712,14 +736,16 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c if( l[i] ) { coeff_abs = abs(l[i]); - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 1 ); - x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 ); + x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : + chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 ); ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level; if( coeff_abs > 1 ) { x264_cabac_encode_decision( cb, ctx, 1 ); - ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level; + ctx = levelgt1_ctx[node_ctx] + ctx_level; if( coeff_abs < 15 ) { cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]]; @@ -741,45 +767,49 @@ static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_c } } else - x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 0 ); + x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : + chroma422dc ? 
coeff_flag_offset_chroma_422_dc[i] : i), 0 ); } } static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1 ); + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1, 0 ); +} +static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) +{ + block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 ); } static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { - block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 ); + block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0, 0 ); } #endif -#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ +#define block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, name )\ do\ {\ int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\ if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ {\ x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac( h, cb, ctx_block_cat, l );\ + block_residual_write_cabac##name( h, cb, ctx_block_cat, l );\ }\ else\ x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ } while(0) +#define block_residual_write_cabac_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, ) + +#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, ) + #define block_residual_write_cabac_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\ -do\ -{\ - int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\ - if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\ - {\ - x264_cabac_encode_decision( cb, ctxidxinc, 1 );\ - block_residual_write_cabac_8x8( h, cb, ctx_block_cat, l );\ - }\ - else\ - x264_cabac_encode_decision( cb, ctxidxinc, 0 );\ -} while(0) + block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, _8x8 ) + +#define block_residual_write_cabac_422_dc_cbf( h, cb, ch, b_intra )\ + block_residual_write_cabac_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, _422_dc ) static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma ) { @@ -808,7 +838,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) - for( int i = 0; i < 8; i++ ) + for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ ) for( int j = 0; j < 8; j++ ) bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); @@ -968,7 +998,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ x264_cabac_mb_transform_size( h, cb ); } - if( h->mb.i_cbp_luma > 0 || (chroma && h->mb.i_cbp_chroma > 0) || i_mb_type == I_16x16 ) + if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 ) { const int b_intra = IS_INTRA( i_mb_type ); x264_cabac_mb_qp_delta( h, cb ); @@ -979,7 +1009,7 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ /* DC Luma */ for( int p = 0; p < plane_count; p++ ) { - block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); + 
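
[Note: the chroma422dc flag follows the same pattern as b_8x8: one ALWAYS_INLINE "internal" body takes the flag as a compile-time constant and each thin wrapper pins it, so the compiler specializes away the untaken branches instead of testing them per coefficient. The idiom in miniature, with illustrative names that are not from x264:]

    static inline int residual_bits_internal( const int *l, int n, int chroma422dc )
    {
        int bits = 0;
        for( int i = 0; i < n; i++ )
            bits += chroma422dc ? 2*(l[i] != 0)  /* stand-in for the 4:2:2 DC contexts */
                                : (l[i] != 0);   /* stand-in for the common path */
        return bits;
    }
    static int residual_bits( const int *l, int n ) { return residual_bits_internal( l, n, 0 ); }
    static int residual_bits_422_dc( const int *l ) { return residual_bits_internal( l, 8, 1 ); }

    int main(void)
    {
        int dc[8] = { 3, 0, -1, 0, 0, 0, 0, 0 };
        return ( residual_bits( dc, 8 ) + residual_bits_422_dc( dc ) == 6 ) ? 0 : 1;
    }
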
block_residual_write_cabac_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 ); /* AC Luma */ if( h->mb.i_cbp_luma ) @@ -1054,12 +1084,24 @@ if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */ { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + block_residual_write_cabac_422_dc_cbf( h, cb, 0, b_intra ); + block_residual_write_cabac_422_dc_cbf( h, cb, 1, b_intra ); + } + else + { + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra ); + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra ); + } + + if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra ); + } } } @@ -1130,8 +1172,19 @@ static void x264_partition_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int if( h->mb.i_cbp_chroma ) { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + int offset = (5*i8) & 0x09; + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 ); + } + else + { + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 ); + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 ); + } } i8 += x264_pixel_size[i_pixel].h >> 3; @@ -1180,19 +1233,30 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 ); } -static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) +static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb ) { x264_cabac_mb_intra_chroma_pred_mode( h, cb ); x264_cabac_mb_cbp_chroma( h, cb ); - if( h->mb.i_cbp_chroma > 0 ) + if( h->mb.i_cbp_chroma ) { - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + block_residual_write_cabac_422_dc_cbf( h, cb, 0, 1 ); + block_residual_write_cabac_422_dc_cbf( h, cb, 1, 1 ); + } + else + { + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 ); + block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 ); + } if( h->mb.i_cbp_chroma == 2 ) 
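
[Note: the chroma AC loops and the (5*i8)&0x09 trick above both fall out of the block numbering: U AC blocks start at 16, V at 32, and 4:2:2 adds a second group of four at +8 (24..27 and 40..43), so `step = 8 << v_shift` walks one or both groups; (5*i8)&0x09 maps a luma 8x8 index to the top 4x4 of its 4x8 chroma column, with the lower half at offset+2. A check of both:]

    #include <assert.h>

    int main(void)
    {
        /* step = 8 << v_shift: 4:2:0 (v_shift=1) visits 16..19 and 32..35,
         * 4:2:2 (v_shift=0) additionally visits 24..27 and 40..43 */
        int count[2] = { 0, 0 };
        for( int v_shift = 0; v_shift <= 1; v_shift++ )
            for( int i = 16; i < 3*16; i += 8 << v_shift )
                for( int j = i; j < i+4; j++ )
                    count[v_shift]++;
        assert( count[0] == 16 && count[1] == 8 );

        /* (5*i8) & 0x09 maps luma 8x8 index {0,1,2,3} -> {0,1,8,9} */
        static const int expect[4] = { 0, 1, 8, 9 };
        for( int i8 = 0; i8 < 4; i8++ )
            assert( ((5*i8) & 0x09) == expect[i8] );
        return 0;
    }
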
- for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 ); + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 ); + } } } #endif diff --git a/encoder/cavlc.c b/encoder/cavlc.c index dcf4e9b4..07397e0a 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -122,10 +122,9 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc { bs_t *s = &h->out.bs; static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0}; - static const uint8_t count_cat[14] = {16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64}; + static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64}; x264_run_level_t runlevel; - int i_trailing, i_total_zero, i_suffix_length; - int i_total = 0; + int i_total, i_trailing, i_total_zero, i_suffix_length; unsigned int i_sign; /* level and run and total */ @@ -177,13 +176,17 @@ static int block_residual_write_cavlc_internal( x264_t *h, int ctx_block_cat, dc } } - if( (uint8_t)i_total < count_cat[ctx_block_cat] ) + if( ctx_block_cat == DCT_CHROMA_DC ) { - if( ctx_block_cat == DCT_CHROMA_DC ) - bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] ); - else - bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); + if( i_total < 8>>h->mb.chroma_v_shift ) + { + vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero] + : x264_total_zeros_2x4_dc[i_total-1][i_total_zero]; + bs_write_vlc( s, total_zeros ); + } } + else if( (uint8_t)i_total < count_cat[ctx_block_cat] ) + bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] ); for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ ) { @@ -199,7 +202,8 @@ static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3}; #define block_residual_write_cavlc(h,cat,idx,l)\ {\ - int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\ + int nC = cat == DCT_CHROMA_DC ? 3 + CHROMA_FORMAT\ + : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\ uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\ if( !*nnz )\ bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\ @@ -323,7 +327,7 @@ void x264_macroblock_write_cavlc( x264_t *h ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] ); if( chroma ) for( int ch = 1; ch < 3; ch++ ) - for( int i = 0; i < 8; i++ ) + for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ ) for( int j = 0; j < 8; j++ ) bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] ); @@ -358,14 +362,14 @@ void x264_macroblock_write_cavlc( x264_t *h ) bs_write( s, 4, i_mode - (i_mode > i_pred) ); } if( chroma ) - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); } else if( i_mb_type == I_16x16 ) { bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] + h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 
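
[Note: under the CHROMA_* values assumed earlier, `3 + CHROMA_FORMAT` picks coeff_token table 4 for 4:2:0 (the value previously hardcoded) and the new table 5 for 4:2:2, matching the x264_coeff_token[6] extension; likewise `8 >> chroma_v_shift` is the DC coefficient count (4 for the 2x2 transform, 8 for 2x4), which selects between the two total_zeros DC tables. A compile-time restatement:]

    #include <assert.h>

    enum { CHROMA_420 = 1, CHROMA_422 = 2 }; /* assumed values, as above */

    int main(void)
    {
        assert( 3 + CHROMA_420 == 4 );            /* the old hardcoded chroma-DC nC */
        assert( 3 + CHROMA_422 == 5 );            /* the new 2x4 table */
        assert( (8 >> 1) == 4 && (8 >> 0) == 8 ); /* DC coeff counts per format */
        return 0;
    }
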
0 : 12 ) ); if( chroma ) - bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); } else if( i_mb_type == P_L0 ) { @@ -539,10 +543,13 @@ void x264_macroblock_write_cavlc( x264_t *h ) /* Chroma DC residual present */ block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); - if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */ - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 ); + if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */ + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); + } } #if !RDO_SKIP_BS @@ -592,8 +599,19 @@ static int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel ) x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 ); if( h->mb.i_cbp_chroma ) { - block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); - block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + int offset = (5*i8) & 0x09; + block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 ); + } + else + { + block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 ); + block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 ); + } } i8 += x264_pixel_size[i_pixel].h >> 3; } @@ -644,18 +662,21 @@ static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) return h->out.bs.i_bits_encoded; } -static int x264_i8x8_chroma_size_cavlc( x264_t *h ) +static int x264_chroma_size_cavlc( x264_t *h ) { - h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] ); + h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] ); if( h->mb.i_cbp_chroma ) { block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] ); block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] ); if( h->mb.i_cbp_chroma == 2 ) - for( int ch = 1; ch < 3; ch++ ) - for( int i = ch*16; i < ch*16+4; i++ ) - block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 ); + { + int step = 8 << h->mb.chroma_v_shift; + for( int i = 16; i < 3*16; i += step ) + for( int j = i; j < i+4; j++ ) + block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 ); + } } return h->out.bs.i_bits_encoded; } diff --git a/encoder/encoder.c b/encoder/encoder.c index 987b39a4..4c47a998 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -71,7 +71,7 @@ static void x264_frame_dump( x264_t *h ) return; /* Write the frame in display order */ - int frame_size = h->param.i_height * h->param.i_width * (3<param.i_height * h->param.i_width * sizeof(pixel) ); fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET ); for( int p = 0; p < (CHROMA444 ? 
3 : 1); p++ ) for( int y = 0; y < h->param.i_height; y++ ) @@ -79,7 +79,7 @@ static void x264_frame_dump( x264_t *h ) if( !CHROMA444 ) { int cw = h->param.i_width>>1; - int ch = h->param.i_height>>1; + int ch = h->param.i_height>>h->mb.chroma_v_shift; pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) ); pixel *planev = planeu + cw*ch + 16; h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch ); @@ -90,7 +90,6 @@ static void x264_frame_dump( x264_t *h ) fclose( f ); } - /* Fill "default" values */ static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh, x264_sps_t *sps, x264_pps_t *pps, @@ -400,6 +399,17 @@ static int x264_validate_parameters( x264_t *h, int b_open ) return -1; } #endif + +#if HAVE_INTERLACED + h->param.b_interlaced = !!PARAM_INTERLACED; +#else + if( h->param.b_interlaced ) + { + x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" ); + return -1; + } +#endif + if( h->param.i_width <= 0 || h->param.i_height <= 0 ) { x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n", @@ -410,26 +420,30 @@ static int x264_validate_parameters( x264_t *h, int b_open ) int i_csp = h->param.i_csp & X264_CSP_MASK; if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } - if( i_csp < X264_CSP_I444 && (h->param.i_width % 2 || h->param.i_height % 2) ) + if( i_csp < X264_CSP_I444 && h->param.i_width % 2 ) { - x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n", + x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n", h->param.i_width, h->param.i_height ); return -1; } -#if HAVE_INTERLACED - h->param.b_interlaced = !!PARAM_INTERLACED; -#else - if( h->param.b_interlaced ) + if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 ) { - x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" ); + x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n", + h->param.i_width, h->param.i_height ); + return -1; + } + + if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 ) + { + x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n", + h->param.i_width, h->param.i_height ); return -1; } -#endif if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width || (h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height ) @@ -927,7 +941,8 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) ); memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) ); h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16; - h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; + h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c; + h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c; h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8; h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4; h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? 
NULL @@ -938,6 +953,39 @@ static void mbcmp_init( x264_t *h ) memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) ); } +static void chroma_dsp_init( x264_t *h ) +{ + memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) ); + + switch( CHROMA_FORMAT ) + { + case CHROMA_420: + memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) ); + h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420; + h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra; + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff; + h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c; + h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4; + h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4; + break; + case CHROMA_422: + memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) ); + h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422; + h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra; + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff; + h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c; + h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8; + h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8; + break; + case CHROMA_444: + h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff; + h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff; + break; + } +} + static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial ) { /* VUI */ @@ -1039,6 +1087,10 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->mb.i_mb_width = h->sps->i_mb_width; h->mb.i_mb_height = h->sps->i_mb_height; h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height; + + h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422; + h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420; + /* Adaptive MBAFF and subme 0 are not supported as we require halving motion * vectors during prediction, resulting in hpel mvs. * The chosen solution is to make MBAFF non-adaptive in this case. */ @@ -1092,6 +1144,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) /* init CPU functions */ x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); + x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter ); x264_predict_4x4_init( h->param.cpu, h->predict_4x4 ); if( h->param.b_cabac ) @@ -1109,6 +1162,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_dct_init_weights(); mbcmp_init( h ); + chroma_dsp_init( h ); p = buf + sprintf( buf, "using cpu capabilities:" ); for( int i = 0; x264_cpu_names[i].flags; i++ ) @@ -1238,6 +1292,7 @@ x264_t *x264_encoder_open( x264_param_t *param ) h->sps->i_profile_idc == PROFILE_MAIN ? "Main" : h->sps->i_profile_idc == PROFILE_HIGH ? "High" : h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") : + h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") : h->sps->b_constraint_set3 == 1 ? 
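
[Note: chroma_dsp_init centralizes the per-format wiring so per-macroblock code stays branch-free: the generic predict_chroma/deblock/coeff_last pointers are aliased to the 4:2:0 or 4:2:2 variants once at open time. A minimal sketch of that init-time dispatch pattern, with hypothetical names and types standing in for the real tables:]

    #include <assert.h>

    typedef void (*predict_fn)( unsigned char *src );

    static void pred_dc_8x8c( unsigned char *src )  { (void)src; /* stand-in */ }
    static void pred_dc_8x16c( unsigned char *src ) { (void)src; /* stand-in */ }

    struct chroma_dsp { predict_fn predict_dc; int dc_coeffs; };

    static void chroma_dsp_wire( struct chroma_dsp *dsp, int v_shift )
    {
        dsp->predict_dc = v_shift ? pred_dc_8x8c : pred_dc_8x16c;
        dsp->dc_coeffs  = 8 >> v_shift; /* the coeff_last4 vs coeff_last8 analogue */
    }

    int main(void)
    {
        struct chroma_dsp dsp;
        chroma_dsp_wire( &dsp, 0 ); /* 4:2:2 */
        assert( dsp.predict_dc == pred_dc_8x16c && dsp.dc_coeffs == 8 );
        return 0;
    }
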
"High 4:4:4 Intra" : "High 4:4:4 Predictive"; char level[4]; snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 ); @@ -1252,8 +1307,9 @@ x264_t *x264_encoder_open( x264_param_t *param ) } else { + static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" }; x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n", - profile, level, CHROMA444 ? "4:4:4" : "4:2:0", BIT_DEPTH ); + profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH ); } return h; @@ -1776,7 +1832,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) * consistency by copying deblocked pixels between planes. */ if( PARAM_INTERLACED ) for( int p = 0; p < h->fdec->i_plane; p++ ) - for( int i = minpix_y>>(!CHROMA444 && p); i < maxpix_y>>(!CHROMA444 && p); i++ ) + for( int i = minpix_y>>(h->mb.chroma_v_shift && p); i < maxpix_y>>(h->mb.chroma_v_shift && p); i++ ) memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p], h->fdec->plane[p] + i*h->fdec->i_stride[p], h->mb.i_mb_width*16*sizeof(pixel) ); @@ -1815,10 +1871,11 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y, int b_inloop ) if( !CHROMA444 ) { uint64_t ssd_u, ssd_v; + int v_shift = h->mb.chroma_v_shift; x264_pixel_ssd_nv12( &h->pixf, - h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1], - h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1], - h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v ); + h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1], + h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1], + h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v ); h->stat.frame.i_ssd[1] += ssd_u; h->stat.frame.i_ssd[2] += ssd_v; } @@ -2263,7 +2320,7 @@ reencode: else //if( h->mb.i_type == I_4x4 ) for( int i = 0; i < 16; i++ ) h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++; - h->stat.frame.i_mb_pred_mode[3][x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]]++; + h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++; } h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED; } @@ -3141,7 +3198,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current, h->stat.frame.i_ssd[2], }; int luma_size = h->param.i_width * h->param.i_height; - int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2); + int chroma_size = CHROMA_SIZE( luma_size ); double psnr_y = x264_psnr( ssd[0], luma_size ); double psnr_u = x264_psnr( ssd[1], chroma_size ); double psnr_v = x264_psnr( ssd[2], chroma_size ); @@ -3232,9 +3289,7 @@ static void x264_print_intra( int64_t *i_mb_count, double i_count, int b_print_p ****************************************************************************/ void x264_encoder_close ( x264_t *h ) { - int luma_size = h->param.i_width * h->param.i_height; - int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2); - int64_t i_yuv_size = luma_size + chroma_size * 2; + int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height ); int64_t i_mb_count_size[2][7] = {{0}}; char buf[200]; int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM] @@ -3470,7 +3525,7 @@ void x264_encoder_close ( x264_t *h ) } for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ ) { - fixed_pred_modes[3][x264_mb_pred_mode8x8c_fix[i]] += h->stat.i_mb_pred_mode[3][i]; + fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += 
h->stat.i_mb_pred_mode[3][i]; sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i]; } if( sum_pred_modes[3] && !CHROMA444 ) diff --git a/encoder/macroblock.c b/encoder/macroblock.c index a8768c57..0dfebb26 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -6,6 +6,7 @@ * Authors: Laurent Aimar * Loren Merritt * Fiona Glaser + * Henrik Gramner * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,7 +41,19 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) } #undef ZIG -#define IDCT_DEQUANT_START \ +static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] ) +{ + level[0] = dct[0]; + level[1] = dct[2]; + level[2] = dct[1]; + level[3] = dct[4]; + level[4] = dct[6]; + level[5] = dct[3]; + level[6] = dct[5]; + level[7] = dct[7]; +} + +#define IDCT_DEQUANT_2X2_START \ int d0 = dct[0] + dct[1]; \ int d1 = dct[2] + dct[3]; \ int d2 = dct[0] - dct[1]; \ @@ -49,21 +62,22 @@ static inline void zigzag_scan_2x2_dc( dctcoef level[4], dctcoef dct[4] ) static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp ) { - IDCT_DEQUANT_START + IDCT_DEQUANT_2X2_START dct4x4[0][0] = (d0 + d1) * dmf >> 5; dct4x4[1][0] = (d0 - d1) * dmf >> 5; dct4x4[2][0] = (d2 + d3) * dmf >> 5; dct4x4[3][0] = (d2 - d3) * dmf >> 5; } -static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp ) +static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp ) { - IDCT_DEQUANT_START - out[0] = (d0 + d1) * dmf >> 5; - out[1] = (d0 - d1) * dmf >> 5; - out[2] = (d2 + d3) * dmf >> 5; - out[3] = (d2 - d3) * dmf >> 5; + IDCT_DEQUANT_2X2_START + dct[0] = (d0 + d1) * dmf >> 5; + dct[1] = (d0 - d1) * dmf >> 5; + dct[2] = (d2 + d3) * dmf >> 5; + dct[3] = (d2 - d3) * dmf >> 5; } +#undef IDCT_2X2_DEQUANT_START static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) { @@ -81,6 +95,23 @@ static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] ) dct4x4[3][0] = 0; } +static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count ) +{ + if( WORD_SIZE == 8 ) + { + for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) ) + if( M64( &v[i] ) ) + return 1; + } + else + { + for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) ) + if( M32( &v[i] ) ) + return 1; + } + return 0; +} + static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx ) { int i_quant_cat = b_intra ? 
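
[Note: idct_dequant_2x2_dconly now works in place, which is safe because the 2x2 chroma DC transform is a Hadamard butterfly: applying it twice scales the input by exactly 4, and that factor is folded into the `* dmf >> 5` dequant scaling. A numeric round-trip check of the butterfly alone, quantization ignored:]

    #include <assert.h>

    static void hadamard2x2( int d[4] )
    {
        int d0 = d[0] + d[1], d1 = d[2] + d[3];
        int d2 = d[0] - d[1], d3 = d[2] - d[3];
        d[0] = d0 + d1; d[1] = d0 - d1;
        d[2] = d2 + d3; d[3] = d2 - d3;
    }

    int main(void)
    {
        int v[4] = { 7, -3, 2, 5 }, w[4] = { 7, -3, 2, 5 };
        hadamard2x2( w );
        hadamard2x2( w ); /* self-inverse up to a factor of 4 */
        for( int i = 0; i < 4; i++ )
            assert( w[i] == 4*v[i] );
        return 0;
    }
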
(p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY); @@ -236,7 +267,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) block_cbp |= nz; } h->mb.i_cbp_luma |= block_cbp * 0xf; - h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4 ); + h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 ); return; } @@ -278,7 +309,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) h->dctf.dct4x4dc( dct_dc4x4 ); if( h->mb.b_trellis ) - nz = x264_quant_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, 0, LUMA_DC+p ); + nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p ); else nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 ); @@ -306,7 +337,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) * Unlike luma blocks, this can't be done with a lookup table or * other shortcut technique because of the interdependencies * between the coefficients due to the chroma DC transform. */ -static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp ) +static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 ) { int dmf = dequant_mf[i_qp%6][0] << i_qp/6; @@ -314,14 +345,18 @@ static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4 if( dmf > 32*64 ) return 1; - return h->quantf.optimize_chroma_dc( dct2x2, dmf ); + if( chroma422 ) + return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf ); + else + return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf ); } -void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) +static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 ) { int nz, nz_dc; int b_decimate = b_inter && h->mb.b_dct_decimate; - ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] ); + int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter]; + ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] ); h->mb.i_cbp_chroma = 0; h->nr_count[2] += h->mb.b_noise_reduction * 4; @@ -330,17 +365,26 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) * Values are experimentally derived. */ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction ) { - int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; int ssd[2]; - int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); + int chromapix = chroma422 ? 
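
[Note: the 4:2:2 decimation threshold `(lambda2+16)>>5` is, up to rounding, exactly twice the 4:2:0 `(lambda2+32)>>6`, tracking the doubled chroma sample count per macroblock. A brute-force check of that relationship:]

    #include <assert.h>
    #include <stdlib.h>

    int main(void)
    {
        for( int lambda2 = 0; lambda2 < 1<<16; lambda2++ )
        {
            int t422 = (lambda2 + 16) >> 5;
            int t420 = (lambda2 + 32) >> 6;
            assert( abs( t422 - 2*t420 ) <= 1 ); /* double, modulo rounding */
        }
        return 0;
    }
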
PIXEL_8x16 : PIXEL_8x8; + + int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); if( score < thresh*4 ) - score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); + score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); if( score < thresh*4 ) { M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0; M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0; + } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -348,20 +392,43 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) { if( ssd[ch] > thresh ) { - h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] ); + pixel *p_src = h->mb.pic.p_fenc[1+ch]; + pixel *p_dst = h->mb.pic.p_fdec[1+ch]; + + if( chroma422 ) + /* Cannot be replaced by two calls to sub8x8_dct_dc since the hadamard transform is different */ + h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst ); + else + h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst ); + if( h->mb.b_trellis ) - nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch ); + nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else - nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + { + nz_dc = 0; + for( int i = 0; i <= chroma422; i++ ) + nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, + h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); + } if( nz_dc ) { - if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) ) + if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) ) continue; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1; - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); - h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); + } + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); h->mb.i_cbp_chroma = 1; } } @@ -377,78 +444,120 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) int i_decimate_score = 0; int nz_ac = 0; - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); + ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { - for( int i = 0; i < 4; i++ ) + static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 }; + + for( int i = 0; i < (chroma422?8:4); i++ ) { - int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE; - int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; - nz = h->zigzagf.sub_4x4ac( 
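
[Note: the `i_qp + 3*chroma422` seen in the DC quant calls mirrors the extra QP offset applied to 4:2:2 chroma DC blocks, compensating for the larger gain of the 2x4 DC Hadamard relative to the 2x2 one. Since the quantizer step doubles every 6 QP, 3 steps is a factor of sqrt(2); a one-line check (link with -lm):]

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        assert( fabs( pow( 2.0, 3/6.0 ) - sqrt( 2.0 ) ) < 1e-12 );
        return 0;
    }
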
h->dct.luma4x4[16+i+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz; + int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE; + int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE; + nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od, + &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] ); + h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; h->mb.i_cbp_chroma |= nz; } - h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch] ); + h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 ); continue; } - h->dctf.sub8x8_dct( dct4x4, p_src, p_dst ); + for( int i = 0; i <= chroma422; i++ ) + h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); + if( h->mb.b_noise_reduction ) - for( int i = 0; i < 4; i++ ) + for( int i = 0; i < (chroma422?8:4); i++ ) h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 ); - dct2x2dc( dct2x2, dct4x4 ); + + if( chroma422 ) + h->dctf.dct2x4dc( dct_dc, dct4x4 ); + else + dct2x2dc( dct_dc, dct4x4 ); + /* calculate dct coeffs */ - for( int i = 0; i < 4; i++ ) + for( int i = 0; i < (chroma422?8:4); i++ ) { if( h->mb.b_trellis ) nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 ); else nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz; + h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz; if( nz ) { nz_ac = 1; - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*16], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp ); if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*16] ); + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] ); } } if( h->mb.b_trellis ) - nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch ); + nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch ); else - nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + { + nz_dc = 0; + for( int i = 0; i <= chroma422; i++ ) + nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1, + h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 ); + } h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc; if( (b_decimate && i_decimate_score < 7) || !nz_ac ) { /* Decimate the block */ - M16( &h->mb.cache.non_zero_count[x264_scan8[16+0+16*ch]] ) = 0; - M16( &h->mb.cache.non_zero_count[x264_scan8[16+2+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0; + if( chroma422 ) + { + M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0; + M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0; + } + if( !nz_dc ) /* Whole block is empty */ continue; - if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) ) + if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) 
) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0; continue; } /* DC-only */ - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); - h->dctf.add8x8_idct_dc( p_dst, dct2x2 ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp ); + } + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] ); } else { h->mb.i_cbp_chroma = 1; + if( nz_dc ) { - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + if( chroma422 ) + { + zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc ); + h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 ); + } + else + { + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc ); + idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp ); + } } - h->dctf.add8x8_idct( p_dst, dct4x4 ); + + for( int i = 0; i <= chroma422; i++ ) + h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] ); } } @@ -457,6 +566,14 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma); } +void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ) +{ + if( CHROMA_FORMAT == CHROMA_420 ) + x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 ); + else + x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 ); +} + static void x264_macroblock_encode_skip( x264_t *h ) { M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0; @@ -467,7 +584,7 @@ static void x264_macroblock_encode_skip( x264_t *h ) M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0; - if( CHROMA444 ) + if( CHROMA_FORMAT >= CHROMA_422 ) { M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0; M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0; @@ -483,26 +600,32 @@ static void x264_macroblock_encode_skip( x264_t *h ) * Intra prediction for predictive lossless mode. 
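The x264_mb_encode_chroma wrapper above is a thin dispatcher: the real body is an ALWAYS_INLINE internal function that receives chroma422 as a compile-time constant, so the compiler emits one specialized copy per chroma format and every chroma422?x:y selection in the hot path folds away. A minimal sketch of the idiom (names are illustrative, not x264's):

    #include <stdio.h>

    #define ALWAYS_INLINE inline __attribute__((always_inline))

    static ALWAYS_INLINE int encode_chroma_internal( int qp, int chroma422 )
    {
        /* With chroma422 a literal 0 or 1 at each call site, this ternary
         * constant-folds and the dead branch is dropped entirely. */
        int blocks = chroma422 ? 8 : 4;
        return blocks * qp;
    }

    static int encode_chroma_420( int qp ) { return encode_chroma_internal( qp, 0 ); }
    static int encode_chroma_422( int qp ) { return encode_chroma_internal( qp, 1 ); }

    int main(void)
    {
        printf( "%d %d\n", encode_chroma_420( 26 ), encode_chroma_422( 26 ) );
        return 0;
    }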
*****************************************************************************/ -void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ) +void x264_predict_lossless_chroma( x264_t *h, int i_mode ) { + int height = 16 >> h->mb.chroma_v_shift; if( i_mode == I_PRED_CHROMA_V ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height ); memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) ); memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) ); } else if( i_mode == I_PRED_CHROMA_H ) { - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height ); x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 ); x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 ); + if( CHROMA_FORMAT == CHROMA_422 ) + { + x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 ); + x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 ); + } } else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } @@ -563,8 +686,9 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 ); if( chroma ) { - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 ); - h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 ); + int height = 16 >> h->mb.chroma_v_shift; + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height ); + h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height ); } return; } @@ -598,22 +722,26 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ if( chroma ) { + int v_shift = h->mb.chroma_v_shift; + int height = 16 >> v_shift; + /* Special case for mv0, which is (of course) very common in P-skip mode. 
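The 2*mvy>>v_shift in the mc_chroma call just below is the whole story of vertical chroma MV scaling: mc_chroma effectively works in eighth-pel chroma units, so for 4:2:0 (v_shift=1, half-height chroma) the luma quarter-pel MV passes through unchanged, while for 4:2:2 (v_shift=0, full-height chroma) it must be doubled. A tiny self-check, assuming quarter-pel luma MVs as elsewhere in x264:

    #include <assert.h>

    /* Vertical chroma MV (1/8-pel chroma units) from a 1/4-pel luma MV,
     * as in the mc_chroma calls of this patch. */
    static int chroma_mvy( int mvy, int v_shift )
    {
        return 2*mvy >> v_shift;
    }

    int main(void)
    {
        assert( chroma_mvy(  6, 1 ) ==  6 );  /* 4:2:0: passes through       */
        assert( chroma_mvy(  6, 0 ) == 12 );  /* 4:2:2: doubled              */
        assert( chroma_mvy( -5, 1 ) == -5 );  /* arithmetic shift keeps sign */
        return 0;
    }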
*/ if( mvx | mvy ) h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], - mvx, mvy, 8, 8 ); + mvx, 2*mvy>>v_shift, 8, height ); else - h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] ); + h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], + h->mb.pic.i_stride[1], height ); if( h->sh.weight[0][1].weightfn ) h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, - &h->sh.weight[0][1], 8 ); + &h->sh.weight[0][1], height ); if( h->sh.weight[0][2].weightfn ) h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, - &h->sh.weight[0][2], 8 ); + &h->sh.weight[0][2], height ); } } @@ -861,18 +989,18 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ { if( IS_INTRA( h->mb.i_type ) ) { - const int i_mode = h->mb.i_chroma_pred_mode; + int i_mode = h->mb.i_chroma_pred_mode; if( h->mb.b_lossless ) - x264_predict_lossless_8x8_chroma( h, i_mode ); + x264_predict_lossless_chroma( h, i_mode ); else { - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] ); - h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] ); + h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] ); } } /* encode the 8x8 blocks */ - x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); + x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); } else h->mb.i_cbp_chroma = 0; @@ -920,13 +1048,10 @@ void x264_macroblock_encode( x264_t *h ) *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] ); - ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] ); + ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] ); ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); - int i_qp = h->mb.i_qp; - int thresh, ssd; for( int p = 0; p < plane_count; p++ ) { @@ -966,11 +1091,13 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b i_qp = h->mb.i_chroma_qp; } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { - /* encode chroma */ i_qp = h->mb.i_chroma_qp; - thresh = (x264_lambda2_tab[i_qp] + 32) >> 6; + int chroma422 = chroma == CHROMA_422; + int thresh = chroma422 ? 
(x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
+        int ssd;
+        ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );

         if( !b_bidir )
         {
@@ -978,9 +1105,10 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
             if( M32( mvp ) )
                 h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
                                  h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
-                                 mvp[0], mvp[1], 8, 8 );
+                                 mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
             else
-                h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+                h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+                                                     h->mb.pic.i_stride[1], chroma422?16:8 );
         }

         for( int ch = 0; ch < 2; ch++ )
@@ -991,11 +1119,11 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b

             if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
                 h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
                                                       h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
-                                                      &h->sh.weight[0][1+ch], 8 );
+                                                      &h->sh.weight[0][1+ch], chroma422?16:8 );

             /* there is almost never a termination during chroma, but we can't avoid the check entirely */
             /* so instead we check SSD and skip the actual check if the score is low enough. */
-            ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+            ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
             if( ssd < thresh )
                 continue;

@@ -1003,28 +1131,38 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b
              * threshold check, so we can save time by doing a DC-only DCT. */
             if( h->mb.b_noise_reduction )
             {
-                h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
-                for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+                for( int i = 0; i <= chroma422; i++ )
+                    h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
+                for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
                 {
                     h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
-                    dct2x2[i4x4] = dct4x4[i4x4][0];
+                    dct_dc[i4x4] = dct4x4[i4x4][0];
                 }
             }
             else
-                h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
+            {
+                if( chroma422 )
+                    h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+                else
+                    h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+            }

-            if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
-                return 0;
+            for( int i = 0; i <= chroma422; i++ )
+                if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
+                                            h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
+                    return 0;

             /* If there wasn't a termination in DC, we can check against a much higher threshold.
*/ if( ssd < thresh*4 ) continue; if( !h->mb.b_noise_reduction ) - h->dctf.sub8x8_dct( dct4x4, p_src, p_dst ); + for( int i = 0; i <= chroma422; i++ ) + h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE ); /* calculate dct coeffs */ - for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) + for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ ) { dct4x4[i4x4][0] = 0; if( h->mb.b_noise_reduction ) @@ -1045,10 +1183,12 @@ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_b int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) { - if( CHROMA444 ) - return x264_macroblock_probe_skip_internal( h, b_bidir, 3, 0 ); + if( CHROMA_FORMAT == CHROMA_444 ) + return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 ); else - return x264_macroblock_probe_skip_internal( h, b_bidir, 1, 1 ); + return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 ); } /**************************************************************************** @@ -1096,6 +1236,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i int x = i8&1; int y = i8>>1; int nz; + int chroma422 = chroma == CHROMA_422; h->mb.i_cbp_chroma = 0; h->mb.i_cbp_luma &= ~(1 << i8); @@ -1128,15 +1269,20 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i } h->mb.i_cbp_luma |= nnz8x8 << i8; } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { for( int ch = 0; ch < 2; ch++ ) { dctcoef dc; - pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE; - pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE; - nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*16], p_fenc, p_fdec, &dc ); - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz; + pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; + pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; + + for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) + { + int offset = chroma422 ? 
8*y + 2*i4x4 + x : i8; + nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc ); + h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; + } } h->mb.i_cbp_chroma = 0x02; } @@ -1212,30 +1358,36 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i } } - if( chroma ) + if( chroma == CHROMA_420 || chroma == CHROMA_422 ) { i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] ); - pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE; - pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE; - h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - if( h->mb.b_noise_reduction ) - h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 ); - dct4x4[0] = 0; + ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] ); + pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; + pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; - if( h->mb.b_trellis ) - nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 ); - else - nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz; - if( nz ) + for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ ) { - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*16], dct4x4 ); - h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); - h->dctf.add4x4_idct( p_fdec, dct4x4 ); + h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE ); + + if( h->mb.b_noise_reduction ) + h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 ); + dct4x4[i4x4][0] = 0; + + if( h->mb.b_trellis ) + nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 ); + else + nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + + int offset = chroma422 ? 
((5*i8) & 0x09) + 2*i4x4 : i8; + h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] ); + h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp ); + h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] ); + } } } h->mb.i_cbp_chroma = 0x02; @@ -1246,9 +1398,11 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { if( CHROMA444 ) - x264_macroblock_encode_p8x8_internal( h, i8, 3, 0 ); + x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 ); + else if( CHROMA_FORMAT == CHROMA_422 ) + x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 ); else - x264_macroblock_encode_p8x8_internal( h, i8, 1, 1 ); + x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 ); } /***************************************************************************** diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 5e3b188d..d8ca95dc 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -41,10 +41,10 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ); #define x264_macroblock_probe_bskip( h )\ x264_macroblock_probe_skip( h, 1 ) -void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode ); void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode ); void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] ); void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ); +void x264_predict_lossless_chroma( x264_t *h, int i_mode ); void x264_macroblock_encode ( x264_t *h ); void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb ); @@ -54,12 +54,13 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ); void x264_macroblock_encode_p4x4( x264_t *h, int i4 ); void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode ); void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge ); -void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ); +void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); -int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, - int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); +int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, + int ctx_block_cat, int b_intra, int idx ); +int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ); int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ); int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, diff --git a/encoder/me.c b/encoder/me.c index e21f2ca8..1c8c8bb3 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -805,17 +805,16 @@ if( b_refine_qpel || (dir^1) != odir ) \ } \ else \ { \ - h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \ + mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \ - &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ + m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); 
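The ((5*i8) & 0x09) expression in the p8x8 hunk above is a branchless form of 8*(i8>>1) + (i8&1): it spreads the 8x8 partition index into the 4:2:2 chroma block numbering, where moving down one partition row jumps eight block indices and the lower 4x4 of each 4x8 chroma column adds 2. A quick equivalence check:

    #include <assert.h>

    int main(void)
    {
        for( int i8 = 0; i8 < 4; i8++ )
            for( int i4x4 = 0; i4x4 < 2; i4x4++ )
            {
                int fast = ((5*i8) & 0x09) + 2*i4x4;    /* as in the patch   */
                int slow = 8*(i8>>1) + (i8&1) + 2*i4x4; /* the obvious form  */
                assert( fast == slow );
            }
        return 0;
    }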
\ + cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \ if( cost < bcost ) \ { \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \ - &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ - cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ + m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \ + cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \ } \ } \ } \ @@ -830,7 +829,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; const int i_pixel = m->i_pixel; const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444); - const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chroma_v_shift = h->mb.chroma_v_shift; + int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment @@ -952,7 +953,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite }\ else\ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\ - mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ + mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\ }\ } @@ -976,14 +977,17 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] ); ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; - int chromasize = CHROMA444 ? 8 : 4; + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chroma_v_shift = h->mb.chroma_v_shift; + int chroma_x = (8 >> h->mb.chroma_h_shift) * x; + int chroma_y = (8 >> chroma_v_shift) * y; pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE]; - pixel *pixu = &h->mb.pic.p_fdec[1][chromasize*x + chromasize*y*FDEC_STRIDE]; - pixel *pixv = &h->mb.pic.p_fdec[2][chromasize*x + chromasize*y*FDEC_STRIDE]; + pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE]; + pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE]; int ref0 = h->mb.cache.ref[0][s8]; int ref1 = h->mb.cache.ref[1][s8]; - const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; int stride[3][2][9]; int bm0x = m0->mv[0]; int bm0y = m0->mv[1]; @@ -1071,8 +1075,8 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m } else { - h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); - h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); + h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); + h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); } uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); COPY2_IF_LT( bcostrd, costrd, bestj, j ); @@ -1153,13 +1157,12 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei } \ else if( m->i_pixel <= PIXEL_8x8 ) \ { \ - h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \ + h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \ + mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \ if( m->weight[1].weightfn ) \ - m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \ - &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ + m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \ if( m->weight[2].weightfn ) \ - m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \ - &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ + m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \ } \ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ @@ -1173,7 +1176,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; const int i_pixel = m->i_pixel; - const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + int chroma_v_shift = h->mb.chroma_v_shift; + int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; uint64_t bcost = COST_MAX64; int bmx = m->mv[0]; @@ -1193,8 +1197,8 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int } else { - pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; - pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; + pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; + pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4]; } h->mb.b_skip_mc = 1; diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index a64a9997..dfe52248 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -219,18 +219,21 @@ static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_f static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store ) { - int w = b_chroma ? 8 : 16; + int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16; int stride = frame->i_stride[i]; int offset = b_field - ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride - : 16 * mb_x + w * mb_y * stride; + ? 
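The mvy_offset changes in the me.c hunks above gate the interlaced chroma phase correction on chroma_v_shift: the quarter-pel offset between top and bottom fields only exists when chroma is vertically subsampled, so 4:2:2 (v_shift 0) masks it out while 4:2:0 keeps the old behaviour. A sketch of the gating, with the x264 globals reduced to plain 0/1 flags:

    #include <assert.h>

    /* +2 or -2 (quarter-pel) vertical correction for field-coded 4:2:0 chroma;
     * zero whenever chroma has full vertical resolution (4:2:2/4:4:4). */
    static int chroma_mvy_offset( int v_shift, int mb_interlaced, int ref_field, int mb_y )
    {
        return (v_shift & mb_interlaced & ref_field) ? (mb_y & 1)*4 - 2 : 0;
    }

    int main(void)
    {
        assert( chroma_mvy_offset( 1, 1, 1, 0 ) == -2 ); /* 4:2:0, top field    */
        assert( chroma_mvy_offset( 1, 1, 1, 1 ) ==  2 ); /* 4:2:0, bottom field */
        assert( chroma_mvy_offset( 0, 1, 1, 1 ) ==  0 ); /* 4:2:2: no offset    */
        return 0;
    }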
16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride + : 16 * mb_x + height * mb_y * stride; stride <<= b_field; if( b_chroma ) { - ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] ); - h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride ); - return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1, b_store ) - + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store ); + ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] ); + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + int shift = 7 - h->mb.chroma_v_shift; + + h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height ); + return ac_energy_var( h->pixf.var[chromapix]( pix, FENC_STRIDE ), shift, frame, 1, b_store ) + + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store ); } else return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store ); @@ -379,9 +382,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off { uint64_t ssd = frame->i_pixel_ssd[i]; uint64_t sum = frame->i_pixel_sum[i]; - int size = CHROMA444 || !i ? 16 : 8; - int width = h->mb.i_mb_width*size; - int height = h->mb.i_mb_height*size; + int width = 16*h->mb.i_mb_width >> (i && h->mb.chroma_h_shift); + int height = 16*h->mb.i_mb_height >> (i && h->mb.chroma_v_shift); frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height); } } @@ -1279,8 +1281,8 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) if( h->param.b_bluray_compat ) mincr = 4; - /* High 10 / High 4:4:4 Predictive doesn't require minCR, so just set the maximum to a large value. */ - if( h->sps->i_profile_idc >= PROFILE_HIGH10 ) + /* Profiles above High don't require minCR, so just set the maximum to a large value. */ + if( h->sps->i_profile_idc > PROFILE_HIGH ) rc->frame_size_maximum = 1e9; else { diff --git a/encoder/rdo.c b/encoder/rdo.c index f994fa02..4ca07508 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -146,7 +146,7 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) static inline int ssd_mb( x264_t *h ) { - int chroma_size = CHROMA444 ? 
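In the ac_energy_plane hunk above, shift = 7 - chroma_v_shift is simply log2 of the chroma block's pixel count: the AC energy is SSD minus sum*sum/N for N a power of two, with 8x16 = 128 = 2^7 samples in 4:2:2 and 8x8 = 64 = 2^6 in 4:2:0. A sketch of that normalization (ac_energy_var itself packs sum and ssd into one value; this unpacked form is mine):

    #include <assert.h>
    #include <stdint.h>

    /* AC energy of an N-pixel block, N = 1<<shift: SSD minus the DC part. */
    static uint32_t ac_energy( uint64_t ssd, uint64_t sum, int shift )
    {
        return ssd - ((sum * sum) >> shift);
    }

    int main(void)
    {
        /* A flat 8x8 block (all pixels 10): ssd = 64*100, sum = 640.
         * All of its energy is DC, so the AC term must come out zero. */
        assert( ac_energy( 6400, 640, 6 ) == 0 );
        /* The flat 8x16 case for 4:2:2 uses shift = 7 - 0 = 7. */
        assert( ac_energy( 12800, 1280, 7 ) == 0 );
        return 0;
    }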
PIXEL_16x16 : PIXEL_8x8; + int chroma_size = h->luma2chroma_pixel[PIXEL_16x16]; int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0); chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8; return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd; @@ -227,7 +227,6 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) { uint64_t i_ssd, i_bits; int i8 = i4 >> 2; - int chromassd; if( i_pixel == PIXEL_16x16 ) { @@ -246,19 +245,13 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) if( i_pixel == PIXEL_8x16 ) x264_macroblock_encode_p8x8( h, i8+2 ); - i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 ); - if( CHROMA444 ) - { - chromassd = ssd_plane( h, i_pixel, 1, (i8&1)*8, (i8>>1)*8 ) - + ssd_plane( h, i_pixel, 2, (i8&1)*8, (i8>>1)*8 ); - } - else - { - chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 ) - + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 ); - } - chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; - i_ssd += chromassd; + int ssd_x = 8*(i8&1); + int ssd_y = 8*(i8>>1); + i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y ); + int chromapix = h->luma2chroma_pixel[i_pixel]; + int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift ) + + ssd_plane( h, chromapix, 2, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift ); + i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8; if( h->param.b_cabac ) { @@ -343,14 +336,16 @@ static uint64_t x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode return (i_ssd<<8) + i_bits; } -static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) +static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct ) { uint64_t i_ssd, i_bits; if( b_dct ) - x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp ); - i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) + - ssd_plane( h, PIXEL_8x8, 2, 0, 0 ); + x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp ); + + int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; + i_ssd = ssd_plane( h, chromapix, 1, 0, 0 ) + + ssd_plane( h, chromapix, 2, 0, 0 ); h->mb.i_chroma_pred_mode = i_mode; @@ -358,11 +353,11 @@ static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, { x264_cabac_t cabac_tmp; COPY_CABAC; - x264_i8x8_chroma_size_cabac( h, &cabac_tmp ); + x264_chroma_size_cabac( h, &cabac_tmp ); i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2; + i_bits = x264_chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -443,7 +438,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, int ctx_block_cat, int i_lambda2, int b_ac, int b_chroma, int dc, int i_coefs, int idx ) { - int abs_coefs[64], signs[64]; + udctcoef abs_coefs[64]; + int8_t signs[64]; trellis_node_t nodes[2][8]; trellis_node_t *nodes_cur = nodes[0]; trellis_node_t *nodes_prev = nodes[1]; @@ -451,6 +447,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, const int b_interlaced = MB_INTERLACED; uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ]; + const uint8_t *levelgt1_ctx = b_chroma && dc ? 
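The trellis hunk above shrinks signs[] to int8_t because it only ever holds +1 or -1, computed just below as coef>>31 | 1: for a 32-bit coefficient, coef>>31 is 0 for non-negative values and -1 (all ones, given an arithmetic shift) for negative ones, so OR-ing in 1 yields the sign with no branch. Self-check:

    #include <assert.h>
    #include <stdint.h>

    static int8_t coef_sign( int32_t coef )
    {
        /* Arithmetic right shift gives 0 or -1; |1 forces the result odd. */
        return coef>>31 | 1;
    }

    int main(void)
    {
        assert( coef_sign( 1234 ) ==  1 );
        assert( coef_sign( 0 )    ==  1 ); /* sign of zero never used: level is 0 */
        assert( coef_sign( -7 )   == -1 );
        return 0;
    }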
coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx; const int f = 1 << 15; // no deadzone int i_last_nnz; int i; @@ -486,7 +483,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, { int coef = dct[zigzag[i]]; abs_coefs[i] = abs(coef); - signs[i] = coef < 0 ? -1 : 1; + signs[i] = coef>>31 | 1; } /* init trellis */ @@ -519,7 +516,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, { // no need to calculate ssd of 0s: it's the same in all nodes. // no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s. - int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i; + int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ) * (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS ); for( int j = 1; j < 8; j++ ) @@ -546,8 +544,10 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, if( i < i_coefs-1 ) { - int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i; - int lastindex = i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i; + int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; + int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : + b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i; cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 ); cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 ); cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ); @@ -599,7 +599,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 ); if( i_prefix > 0 ) { - uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]]; + uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]]; f8_bits += cabac_size_unary[i_prefix][*ctx]; *ctx = cabac_transition_unary[i_prefix][*ctx]; if( abs_level >= 15 ) @@ -695,7 +695,8 @@ int quant_trellis_cavlc( x264_t *h, dctcoef *dct, int64_t score = 1ULL<<62; int i, j; const int f = 1<<15; - int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )]; + int nC = b_chroma && dc ? 3 + (i_coefs>>2) + : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )]; /* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different * step/start/end than internal processing. */ @@ -857,24 +858,46 @@ zeroblock: return 0; } -const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3}; - -int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, - int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx ) +int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx ) { if( h->param.b_cabac ) return quant_trellis_cabac( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED], - ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 
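In quant_trellis_cavlc above, nC acts as the coeff_token table selector rather than the spec's neighbour count: luma and chroma AC still go through ct_index[], while chroma DC now picks its table directly from the coefficient count, 3 + (4>>2) = 4 for the 2x2 DC of 4:2:0 and 3 + (8>>2) = 5 for the 2x4 DC of 4:2:2. Worked out:

    #include <assert.h>

    /* coeff_token table selector for chroma DC, from the coefficient count. */
    static int chroma_dc_table( int i_coefs )
    {
        return 3 + (i_coefs>>2);
    }

    int main(void)
    {
        assert( chroma_dc_table( 4 ) == 4 ); /* 4:2:0: 2x2 chroma DC table */
        assert( chroma_dc_table( 8 ) == 5 ); /* 4:2:2: 2x4 chroma DC table */
        return 0;
    }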
4 : 16, idx ); + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx ); + + return quant_trellis_cavlc( h, dct, + h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED], + DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 ); +} - if( ctx_block_cat != DCT_CHROMA_DC ) - ctx_block_cat = DCT_LUMA_DC; +static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 }; +static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 }; + +int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx ) +{ + const uint8_t *zigzag; + int num_coefs; + int quant_cat = CQM_4IC+1 - b_intra; + + if( CHROMA_FORMAT == CHROMA_422 ) + { + zigzag = x264_zigzag_scan2x4; + num_coefs = 8; + } + else + { + zigzag = x264_zigzag_scan2x2; + num_coefs = 4; + } + + if( h->param.b_cabac ) + return quant_trellis_cabac( h, dct, + h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx ); return quant_trellis_cavlc( h, dct, - h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], - NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED], - ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx, 0 ); + h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag, + DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 ); } int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, diff --git a/encoder/set.c b/encoder/set.c index a498c945..5e1ff642 100644 --- a/encoder/set.c +++ b/encoder/set.c @@ -104,11 +104,14 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) sps->i_id = i_id; sps->i_mb_width = ( param->i_width + 15 ) / 16; sps->i_mb_height= ( param->i_height + 15 ) / 16; - sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? 3 : 1; + sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 : + csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420; sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0; - if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == 3 ) + if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 ) sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE; + else if( sps->i_chroma_format_idc == CHROMA_422 ) + sps->i_profile_idc = PROFILE_HIGH422; else if( BIT_DEPTH > 8 ) sps->i_profile_idc = PROFILE_HIGH10; else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT ) @@ -132,11 +135,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param ) sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */ sps->i_level_idc = 11; } - /* High 10 Intra profile */ - if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH10 ) - sps->b_constraint_set3 = 1; - /* High 4:4:4 Intra profile */ - if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH444_PREDICTIVE ) + /* Intra profiles */ + if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH ) sps->b_constraint_set3 = 1; sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 
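The x264_sps_init hunk above derives both chroma_format_idc and the profile from the colourspace: lossless or 4:4:4 forces High 4:4:4 Predictive, 4:2:2 input selects the new High 4:2:2 profile, and any profile above High signals the matching Intra profile via constraint_set3 when keyint is 1. A condensed sketch of the decision ladder (plain ints; the helper name is mine, the profile_idc values are the spec's):

    #include <assert.h>

    enum { CHROMA_420 = 1, CHROMA_422 = 2, CHROMA_444 = 3 };
    enum { PROFILE_HIGH = 100, PROFILE_HIGH10 = 110,
           PROFILE_HIGH422 = 122, PROFILE_HIGH444_PREDICTIVE = 244 };

    static int choose_profile( int chroma_format_idc, int lossless,
                               int bit_depth, int b_8x8dct_or_cqm )
    {
        if( lossless || chroma_format_idc == CHROMA_444 )
            return PROFILE_HIGH444_PREDICTIVE;
        if( chroma_format_idc == CHROMA_422 )
            return PROFILE_HIGH422;
        if( bit_depth > 8 )
            return PROFILE_HIGH10;
        if( b_8x8dct_or_cqm )
            return PROFILE_HIGH;
        return 0; /* Main or lower; decided elsewhere */
    }

    int main(void)
    {
        assert( choose_profile( CHROMA_422, 0, 8, 1 ) == PROFILE_HIGH422 );
        assert( choose_profile( CHROMA_420, 1, 8, 0 ) == PROFILE_HIGH444_PREDICTIVE );
        return 0;
    }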
1 : 0; @@ -302,11 +302,12 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps ) bs_write1( s, sps->b_crop ); if( sps->b_crop ) { - int cropshift = sps->i_chroma_format_idc != 3; - bs_write_ue( s, sps->crop.i_left >> cropshift ); - bs_write_ue( s, sps->crop.i_right >> cropshift ); - bs_write_ue( s, sps->crop.i_top >> cropshift ); - bs_write_ue( s, sps->crop.i_bottom >> cropshift ); + int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422; + int v_shift = sps->i_chroma_format_idc == CHROMA_420; + bs_write_ue( s, sps->crop.i_left >> h_shift ); + bs_write_ue( s, sps->crop.i_right >> h_shift ); + bs_write_ue( s, sps->crop.i_top >> v_shift ); + bs_write_ue( s, sps->crop.i_bottom >> v_shift ); } bs_write1( s, sps->b_vui ); @@ -757,7 +758,7 @@ int x264_validate_levels( x264_t *h, int verbose ) int ret = 0; int mbs = h->sps->i_mb_width * h->sps->i_mb_height; int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering; - int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH444_PREDICTIVE ? 16 : + int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 : h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 : h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4; diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 5a91c167..0acda252 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -96,12 +96,11 @@ static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc return ref->lowres[0]; } -/* How data is organized for chroma weightp 4:2:0: +/* How data is organized for 4:2:0/4:2:2 chroma weightp: * [U: ref] [U: fenc] * [V: ref] [V: fenc] * fenc = ref + offset - * v = u + stride * chroma height - * We'll need more room if we do 4:2:2. */ + * v = u + stride * chroma height */ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv ) { @@ -110,21 +109,23 @@ static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc int i_offset = i_stride / 2; int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; - int cw = h->mb.i_mb_width << 3; - int ch = h->mb.i_mb_height << 3; + int v_shift = h->mb.chroma_v_shift; + int cw = 8*h->mb.i_mb_width; + int ch = 16*h->mb.i_mb_height >> v_shift; + int height = 16 >> v_shift; if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF ) { x264_frame_expand_border_chroma( h, ref, 1 ); - for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride ) + for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride ) for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 ) { pixel *pixu = dstu + pel_offset_y + pel_offset_x; pixel *pixv = dstv + pel_offset_y + pel_offset_x; - pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */ + pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */ int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0]; int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1]; - h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, mvy, 8, 8 ); + h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height ); } } else @@ -223,15 +224,17 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f int i_lines = fenc->i_lines[1]; int i_width = fenc->i_width[1]; pixel *src = ref + i_offset; - ALIGNED_ARRAY_16( pixel, buf, [8*8] ); + ALIGNED_ARRAY_16( pixel, buf, [8*16] ); int pixoff = 0; + int chromapix = 
h->luma2chroma_pixel[PIXEL_16x16]; + int height = 16 >> h->mb.chroma_v_shift; ALIGNED_16( static pixel flat[8] ) = {0}; if( w ) { - for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) { - w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 ); + w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height ); /* The naive and seemingly sensible algorithm is to use mbcmp as in luma. * But testing shows that for chroma the DC coefficient is by far the most * important part of the coding cost. Thus a more useful chroma weight is @@ -239,16 +242,16 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f * pixels. * * FIXME: add a (faster) asm sum function to replace sad. */ - cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( buf, 8, flat, 0 ) - - h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) ); + cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) - + h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); } cost += x264_weight_slice_header_cost( h, w, 1 ); } else - for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride ) + for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride ) for( int x = 0; x < i_width; x += 8, pixoff += 8 ) - cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) - - h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) ); + cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) - + h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) ); x264_emms(); return cost; } diff --git a/filters/video/depth.c b/filters/video/depth.c index 25dde257..9ea2cbcf 100644 --- a/filters/video/depth.c +++ b/filters/video/depth.c @@ -46,15 +46,17 @@ static int depth_filter_csp_is_supported( int csp ) return csp_mask == X264_CSP_I420 || csp_mask == X264_CSP_I422 || csp_mask == X264_CSP_I444 || - csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_YV12 || - csp_mask == X264_CSP_NV12; + csp_mask == X264_CSP_YV16 || + csp_mask == X264_CSP_YV24 || + csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV16; } static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return ( csp_mask == X264_CSP_NV12 && plane == 1 ) ? 2 : 1; + return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1; } /* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been diff --git a/filters/video/resize.c b/filters/video/resize.c index 878a4d77..87687268 100644 --- a/filters/video/resize.c +++ b/filters/video/resize.c @@ -137,6 +137,7 @@ static int convert_csp_to_pix_fmt( int csp ) { case X264_CSP_YV12: /* specially handled via swapping chroma */ case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P; + case X264_CSP_YV16: /* specially handled via swapping chroma */ case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P; case X264_CSP_YV24: /* specially handled via swapping chroma */ case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? 
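csp_num_interleaved above reflects that NV12 and NV16 store U and V interleaved in their second plane; NV16 simply keeps that plane at full height. A sketch of splitting one such UVUV... row into separate U and V rows (buffer names are mine):

    #include <assert.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    /* Split one interleaved row of 'width' chroma samples per channel. */
    static void deinterleave_row( pixel *u, pixel *v, const pixel *uv, int width )
    {
        for( int x = 0; x < width; x++ )
        {
            u[x] = uv[2*x];
            v[x] = uv[2*x+1];
        }
    }

    int main(void)
    {
        pixel uv[8] = { 1,2, 3,4, 5,6, 7,8 };
        pixel u[4], v[4];
        deinterleave_row( u, v, uv, 4 );
        assert( u[0] == 1 && v[0] == 2 && u[3] == 7 && v[3] == 8 );
        return 0;
    }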
PIX_FMT_YUV444P16 : PIX_FMT_YUV444P; @@ -467,11 +468,11 @@ static int init( hnd_t *handle, cli_vid_filter_t *filter, video_info_t *info, x2 h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp ); h->scale = h->dst; - /* swap chroma planes if YV12/YV24 is involved, as libswscale works with I420/I444 */ + /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */ int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER); int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER); - h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV24; - h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV24; + h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24; + h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24; int src_pix_fmt = convert_csp_to_pix_fmt( info->csp ); diff --git a/input/avs.c b/input/avs.c index 59fab8cc..0169746d 100644 --- a/input/avs.c +++ b/input/avs.c @@ -219,15 +219,22 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c } #if !HAVE_SWSCALE /* if swscale is not available, convert the CSP if necessary */ - if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || - (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) + if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) || + (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) ) { - FAIL_IF_ERROR( avs_version < 2.6f && opt->output_csp == X264_CSP_I444, "avisynth >= 2.6 is required for i444 output\n" ) + FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444), + "avisynth >= 2.6 is required for i422/i444 output\n" ) - const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : (opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB"); + const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : + opt->output_csp == X264_CSP_I422 ? "YV16" : + opt->output_csp == X264_CSP_I444 ? 
"YV24" : "RGB"; x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp ); - FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && (vi->width&1 || vi->height&1), - "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1), + "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3), + "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height ) + FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1), + "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height ) const char *arg_name[2] = { NULL, "interlaced" }; AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) }; char conv_func[14] = { "ConvertTo" }; @@ -251,13 +258,13 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c info->csp = X264_CSP_BGR | X264_CSP_VFLIP; else if( avs_is_yv24( vi ) ) info->csp = X264_CSP_I444; + else if( avs_is_yv16( vi ) ) + info->csp = X264_CSP_I422; else if( avs_is_yv12( vi ) ) info->csp = X264_CSP_I420; #if HAVE_SWSCALE else if( avs_is_yuy2( vi ) ) info->csp = PIX_FMT_YUYV422 | X264_CSP_OTHER; - else if( avs_is_yv16( vi ) ) - info->csp = X264_CSP_I422; else if( avs_is_yv411( vi ) ) info->csp = PIX_FMT_YUV411P | X264_CSP_OTHER; else if( avs_is_y8( vi ) ) diff --git a/input/input.c b/input/input.c index 084499ae..27c2c3df 100644 --- a/input/input.c +++ b/input/input.c @@ -29,9 +29,11 @@ const x264_cli_csp_t x264_cli_csps[] = { [X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, [X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_I444] = { "i444", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, - [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 }, + [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, + [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, + [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, [X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 }, diff --git a/input/input.h b/input/input.h index 4a4bb0b2..bd7e4218 100644 --- a/input/input.h +++ b/input/input.h @@ -103,8 +103,7 @@ extern cli_input_t timecode_input; extern cli_input_t cli_input; /* extended colorspace list that isn't supported by libx264 but by the cli */ -#define X264_CSP_I422 X264_CSP_MAX /* yuv 4:2:2 planar */ -#define X264_CSP_CLI_MAX (X264_CSP_MAX+1) /* end of list */ +#define X264_CSP_CLI_MAX X264_CSP_MAX /* end of list */ #define X264_CSP_OTHER 0x4000 /* non x264 colorspace */ typedef struct diff --git a/tools/checkasm.c b/tools/checkasm.c index bb1fafc8..0eb1ed54 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -77,11 +77,12 @@ const char *bench_pattern = ""; char func_name[100]; static bench_func_t benchs[MAX_FUNCS]; -static const char *pixel_names[10] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x2", "2x4", "2x2" }; +static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" }; static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_8x8c_names[7] = { 
"dc", "h", "v", "p", "dcl", "dct", "dc8" }; static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" }; static const char **intra_predict_8x8_names = intra_predict_4x4_names; +static const char **intra_predict_8x16c_names = intra_predict_8x8c_names; #define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ ) @@ -274,7 +275,7 @@ static int check_pixel( int cpu_ref, int cpu_new ) #define TEST_PIXEL( name, align ) \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 7; i++ ) \ + for( int i = 0; i < 8; i++ ) \ { \ int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ @@ -374,24 +375,28 @@ static int check_pixel( int cpu_ref, int cpu_new ) ok = 1; used_asm = 0; TEST_PIXEL_VAR( PIXEL_16x16 ); + TEST_PIXEL_VAR( PIXEL_8x16 ); TEST_PIXEL_VAR( PIXEL_8x8 ); report( "pixel var :" ); - ok = 1; used_asm = 0; - if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 ) - { - int res_c, res_asm, ssd_c, ssd_asm; - set_func_name( "var2_8x8" ); - used_asm = 1; - res_c = call_c( pixel_c.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_c ); - res_asm = call_a( pixel_asm.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_asm ); - if( res_c != res_asm || ssd_c != ssd_asm ) - { - ok = 0; - fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm ); - } +#define TEST_PIXEL_VAR2( i ) \ + if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \ + { \ + int res_c, res_asm, ssd_c, ssd_asm; \ + set_func_name( "%s_%s", "var2", pixel_names[i] ); \ + used_asm = 1; \ + res_c = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \ + res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \ + if( res_c != res_asm || ssd_c != ssd_asm ) \ + { \ + ok = 0; \ + fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \ + } \ } + ok = 1; used_asm = 0; + TEST_PIXEL_VAR2( PIXEL_8x16 ); + TEST_PIXEL_VAR2( PIXEL_8x8 ); report( "pixel var2 :" ); ok = 1; used_asm = 0; @@ -490,12 +495,14 @@ static int check_pixel( int cpu_ref, int cpu_new ) memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_satd_x3_16x16, 0 ); + TEST_INTRA_X3( intra_satd_x3_8x16c, 0 ); TEST_INTRA_X3( intra_satd_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_satd_x3_4x4, 0 ); report( "intra satd_x3 :" ); ok = 1; used_asm = 0; TEST_INTRA_X3( intra_sad_x3_16x16, 0 ); + TEST_INTRA_X3( intra_sad_x3_8x16c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8c, 0 ); TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge ); TEST_INTRA_X3( intra_sad_x3_4x4, 0 ); @@ -597,7 +604,7 @@ static int check_dct( int cpu_ref, int cpu_new ) ALIGNED_16( dctcoef dct2[16][16] ); ALIGNED_16( dctcoef dct4[16][16] ); ALIGNED_16( dctcoef dct8[4][64] ); - ALIGNED_16( dctcoef dctdc[2][4] ); + ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -671,6 +678,7 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 ); TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 ); TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 ); + TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 ); TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 ); report( "sub_dct4 :" ); @@ -757,6 +765,36 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_DCTDC( idct4x4dc ); #undef TEST_DCTDC +#define TEST_DCTDC_CHROMA( name )\ + ok = 1; used_asm = 0;\ + if( dct_asm.name != dct_ref.name )\ + {\ + set_func_name( #name );\ + used_asm = 1;\ + uint16_t *p = (uint16_t*)buf1;\ + for( int i = 0; i < 16 && ok; 
i++ )\ + {\ + for( int j = 0; j < 8; j++ )\ + dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\ + : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\ + : ((*p++)&0x1fff)-0x1000; /* general case */\ + memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\ + call_c1( dct_c.name, dctdc[0], dct1 );\ + call_a1( dct_asm.name, dctdc[1], dct2 );\ + if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\ + {\ + ok = 0;\ + fprintf( stderr, #name " [FAILED]\n" ); \ + }\ + }\ + call_c2( dct_c.name, dctdc[0], dct1 );\ + call_a2( dct_asm.name, dctdc[1], dct2 );\ + }\ + report( #name " :" ); + + TEST_DCTDC_CHROMA( dct2x4dc ); +#undef TEST_DCTDC_CHROMA + x264_zigzag_function_t zigzag_c[2]; x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; @@ -986,7 +1024,7 @@ static int check_mc( int cpu_ref, int cpu_new ) #define MC_TEST_AVG( name, weight ) \ { \ ok = 1, used_asm = 0; \ - for( int i = 0; i < 10; i++ ) \ + for( int i = 0; i < 12; i++ ) \ { \ memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \ memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \ @@ -1085,34 +1123,49 @@ static int check_mc( int cpu_ref, int cpu_new ) report( "mc offsetsub :" ); ok = 1; used_asm = 0; - if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 ) - { - set_func_name( "store_interleave_8x8x2" ); - used_asm = 1; - memset( pbuf3, 0, 64*8 ); - memset( pbuf4, 0, 64*8 ); - call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 ); - call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 ); - if( memcmp( pbuf3, pbuf4, 64*8 ) ) - ok = 0; - } - if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc ) - { - set_func_name( "load_deinterleave_8x8x2_fenc" ); - used_asm = 1; - call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 ); - call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 ); - if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) ) - ok = 0; - } - if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec ) + for( int height = 8; height <= 16; height += 8 ) { - set_func_name( "load_deinterleave_8x8x2_fdec" ); - used_asm = 1; - call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 ); - call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 ); - if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) ) - ok = 0; + if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma ) + { + set_func_name( "store_interleave_chroma" ); + used_asm = 1; + memset( pbuf3, 0, 64*height ); + memset( pbuf4, 0, 64*height ); + call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height ); + call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height ); + if( memcmp( pbuf3, pbuf4, 64*height ) ) + { + ok = 0; + fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height ); + break; + } + } + if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc ) + { + set_func_name( "load_deinterleave_chroma_fenc" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height ); + call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height ); + if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) ) + { + ok = 0; + fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height ); + break; + } + } + if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec ) + { + set_func_name( "load_deinterleave_chroma_fdec" ); + used_asm = 1; + call_c( mc_c.load_deinterleave_chroma_fdec, 
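The checkasm hunk above turns the fixed 8x8x2 interleave tests into a height loop, so one harness covers both the 8-line (4:2:0) and 16-line (4:2:2) variants: each function runs through the C and asm paths and the outputs are memcmp'd. A minimal, self-contained version of that compare-two-implementations pattern (both "implementations" here are trivial stand-ins):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    typedef uint8_t pixel;

    static void copy_c( pixel *dst, const pixel *src, int n )   { memcpy( dst, src, n ); }
    static void copy_opt( pixel *dst, const pixel *src, int n ) { for( int i = 0; i < n; i++ ) dst[i] = src[i]; }

    int main(void)
    {
        pixel src[64*16], out_c[64*16], out_a[64*16];
        for( int i = 0; i < 64*16; i++ )
            src[i] = i & 0xff;
        /* Same shape as the patch: exercise both chroma heights. */
        for( int height = 8; height <= 16; height += 8 )
        {
            copy_c  ( out_c, src, 64*height );
            copy_opt( out_a, src, 64*height );
            if( memcmp( out_c, out_a, 64*height ) )
                fprintf( stderr, "FAILED: h=%d\n", height );
        }
        return 0;
    }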
pbuf3, pbuf1, 64, height ); + call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height ); + if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) ) + { + ok = 0; + fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height ); + break; + } + } } report( "store_interleave :" ); @@ -1411,11 +1464,13 @@ static int check_deblock( int cpu_ref, int cpu_new ) TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] ); TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] ); - TEST_DEBLOCK( deblock_chroma[0], 0, tcs[i] ); + TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] ); + TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] ); TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] ); TEST_DEBLOCK( deblock_luma_intra[0], 0 ); TEST_DEBLOCK( deblock_luma_intra[1], 1 ); - TEST_DEBLOCK( deblock_chroma_intra[0], 0 ); + TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 ); + TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 ); TEST_DEBLOCK( deblock_chroma_intra[1], 1 ); if( db_a.deblock_strength != db_ref.deblock_strength ) @@ -1471,6 +1526,8 @@ static int check_quant( int cpu_ref, int cpu_new ) x264_quant_function_t qf_a; ALIGNED_16( dctcoef dct1[64] ); ALIGNED_16( dctcoef dct2[64] ); + ALIGNED_16( dctcoef dct3[8][16] ); + ALIGNED_16( dctcoef dct4[8][16] ); ALIGNED_16( uint8_t cqm_buf[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; @@ -1602,7 +1659,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ INIT_QUANT##w(1) \ - call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ @@ -1631,7 +1688,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ for( int i = 0; i < 16; i++ ) \ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \ - call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ + qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \ memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \ call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \ @@ -1647,27 +1704,75 @@ static int check_quant( int cpu_ref, int cpu_new ) TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 ); -#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \ + if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc ) + { + set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" ); + used_asms[1] = 1; + for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) + { + for( int i = 0; i < 8; i++ ) + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; + qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 ); + for( int i = 0; i < 8; i++ ) + if( dct3[i][0] != dct4[i][0] ) + { + oks[1] = 0; + fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); + break; + } + } + } + + if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly ) + { + set_func_name( 
"idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" ); + used_asms[1] = 1; + for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) + { + for( int i = 0; i < 8; i++ ) + dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; + qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 ); + memcpy( dct2, dct1, 8*sizeof(dctcoef) ); + call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); + if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) ) + { + oks[1] = 0; + fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); + break; + } + call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 ); + call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 ); + } + } + +#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \ if( qf_a.optname != qf_ref.optname ) \ { \ set_func_name( #optname ); \ used_asms[2] = 1; \ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \ { \ - int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \ + int qpdc = qp + (size == 8 ? 3 : 0); \ + int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \ if( dmf > 32*64 ) \ continue; \ - for( int i = 16; ; i <<= 1 )\ + for( int i = 16; ; i <<= 1 ) \ { \ int res_c, res_asm; \ int max = X264_MIN( i, PIXEL_MAX*16 ); \ - for( int j = 0; j < w*w; j++ ) \ + for( int j = 0; j < size; j++ ) \ dct1[j] = rand()%(max*2+1) - max; \ - call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \ - memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \ + for( int j = 0; i <= size; j += 4 ) \ + qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \ + memcpy( dct2, dct1, size*sizeof(dctcoef) ); \ res_c = call_c1( qf_c.optname, dct1, dmf ); \ res_asm = call_a1( qf_a.optname, dct2, dmf ); \ - if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \ + if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \ { \ oks[2] = 0; \ fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \ @@ -1680,7 +1785,8 @@ static int check_quant( int cpu_ref, int cpu_new ) } \ } - TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 ); + TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 ); + TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 ); x264_cqm_delete( h ); } @@ -1751,7 +1857,7 @@ static int check_quant( int cpu_ref, int cpu_new ) TEST_DECIMATE( decimate_score15, 4, 1, 7 ); report( "decimate_score :" ); -#define TEST_LAST( last, lastname, w, ac ) \ +#define TEST_LAST( last, lastname, size, ac ) \ if( qf_a.last != qf_ref.last ) \ { \ set_func_name( #lastname ); \ @@ -1759,8 +1865,8 @@ static int check_quant( int cpu_ref, int cpu_new ) for( int i = 0; i < 100; i++ ) \ { \ int nnz = 0; \ - int max = rand() & (w*w-1); \ - memset( dct1, 0, w*w*sizeof(dctcoef) ); \ + int max = rand() & (size-1); \ + memset( dct1, 0, size*sizeof(dctcoef) ); \ for( int idx = ac; idx < max; idx++ ) \ nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \ if( !nnz ) \ @@ -1777,13 +1883,14 @@ static int check_quant( int cpu_ref, int cpu_new ) } ok = 1; used_asm = 0; - TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 ); - TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 ); - TEST_LAST( coeff_last[ 
@@ -1777,13 +1883,14 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
 
     ok = 1; used_asm = 0;
-    TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
-    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
-    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
+    TEST_LAST( coeff_last4 , coeff_last4, 4, 0 );
+    TEST_LAST( coeff_last8 , coeff_last8, 8, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 16, 1 );
+    TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
+    TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
     report( "coeff_last :" );
 
-#define TEST_LEVELRUN( lastname, name, w, ac ) \
+#define TEST_LEVELRUN( lastname, name, size, ac ) \
 if( qf_a.lastname != qf_ref.lastname ) \
 { \
     set_func_name( #name ); \
@@ -1792,8 +1899,8 @@ static int check_quant( int cpu_ref, int cpu_new )
     { \
         x264_run_level_t runlevel_c, runlevel_a; \
         int nnz = 0; \
-        int max = rand() & (w*w-1); \
-        memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+        int max = rand() & (size-1); \
+        memset( dct1, 0, size*sizeof(dctcoef) ); \
         memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
         memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
         for( int idx = ac; idx < max; idx++ ) \
@@ -1814,9 +1921,10 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
 
     ok = 1; used_asm = 0;
-    TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
-    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+    TEST_LEVELRUN( coeff_level_run4 , coeff_level_run4, 4, 0 );
+    TEST_LEVELRUN( coeff_level_run8 , coeff_level_run8, 8, 0 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 16, 1 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
     report( "coeff_level_run :" );
 
     return ret;
@@ -1832,6 +1940,7 @@ static int check_intra( int cpu_ref, int cpu_new )
 {
     x264_predict_t predict_16x16[4+3];
     x264_predict_t predict_8x8c[4+3];
+    x264_predict_t predict_8x16c[4+3];
     x264_predict8x8_t predict_8x8[9+3];
     x264_predict_t predict_4x4[9+3];
     x264_predict_8x8_filter_t predict_8x8_filter;
 
     x264_predict_16x16_init( 0, ip_c.predict_16x16 );
     x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
+    x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
     x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
     x264_predict_4x4_init( 0, ip_c.predict_4x4 );
 
     x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
     x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
+    x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
     x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
     x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
 
     x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
     x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
+    x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
     x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
 
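For reference: predict_8x16c covers the 8-wide, 16-tall chroma block of a 4:2:2 macroblock, mirroring the seven 8x8c modes. A sketch of the simplest mode, vertical prediction, under the same pixel/FDEC_STRIDE assumptions as the earlier sketch (illustrative, not x264's exact code):

    #include <string.h>

    /* Vertical prediction for the 8x16 chroma block of 4:2:2: each of the
     * 16 rows is a copy of the row of neighbors directly above the block. */
    static void predict_8x16c_v_sketch( pixel *src )
    {
        for( int y = 0; y < 16; y++ )
            memcpy( &src[y*FDEC_STRIDE], &src[-FDEC_STRIDE], 8*sizeof(pixel) );
    }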
@@ -1856,7 +1968,7 @@ static int check_intra( int cpu_ref, int cpu_new )
 
     ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 
-#define INTRA_TEST( name, dir, w, align, bench, ... )\
+#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
 if( ip_a.name[dir] != ip_ref.name[dir] )\
 {\
     set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
@@ -1874,7 +1986,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             for( int k = -1; k < 16; k++ )\
                 printf( "%2x ", edge[16+k] );\
             printf( "\n" );\
-            for( int j = 0; j < w; j++ )\
+            for( int j = 0; j < h; j++ )\
            {\
                 printf( "%2x ", edge[14-j] );\
                 for( int k = 0; k < w; k++ )\
@@ -1882,7 +1994,7 @@ static int check_intra( int cpu_ref, int cpu_new )
                 printf( "\n" );\
             }\
             printf( "\n" );\
-            for( int j = 0; j < w; j++ )\
+            for( int j = 0; j < h; j++ )\
             {\
                 printf( " " );\
                 for( int k = 0; k < w; k++ )\
@@ -1895,13 +2007,15 @@ static int check_intra( int cpu_ref, int cpu_new )
 }
 
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_4x4, i, 4, 4, );
+        INTRA_TEST( predict_4x4, i, 4, 4, 4, );
+    for( int i = 0; i < 7; i++ )
+        INTRA_TEST( predict_8x8c, i, 8, 8, 16, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_8x8c, i, 8, 16, );
+        INTRA_TEST( predict_8x16c, i, 8, 16, 16, );
     for( int i = 0; i < 7; i++ )
-        INTRA_TEST( predict_16x16, i, 16, 16, );
+        INTRA_TEST( predict_16x16, i, 16, 16, 16, );
     for( int i = 0; i < 12; i++ )
-        INTRA_TEST( predict_8x8, i, 8, 8, , edge );
+        INTRA_TEST( predict_8x8, i, 8, 8, 8, , edge );
 
     set_func_name("intra_predict_8x8_filter");
     if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
@@ -1926,31 +2040,33 @@ static int check_intra( int cpu_ref, int cpu_new )
         }
     }
 
-#define EXTREMAL_PLANE(size) \
+#define EXTREMAL_PLANE( w, h ) \
 { \
     int max[7]; \
     for( int j = 0; j < 7; j++ ) \
         max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
     fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
-    for( int j = 0; j < size/2; j++ ) \
+    for( int j = 0; j < w/2; j++ ) \
         fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
-    for( int j = size/2; j < size-1; j++ ) \
+    for( int j = w/2; j < w-1; j++ ) \
         fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
-    fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
-    for( int j = 0; j < size/2; j++ ) \
+    fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
+    for( int j = 0; j < h/2; j++ ) \
         fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
-    for( int j = size/2; j < size-1; j++ ) \
+    for( int j = h/2; j < h-1; j++ ) \
         fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
-    fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
+    fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
 }
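For reference: EXTREMAL_PLANE drives the seven border regions feeding plane-mode prediction to 0 or PIXEL_MAX in every combination, because the gradient terms H and V are where SIMD implementations typically overflow 16-bit intermediates. For the new 8x16 chroma block, plane prediction follows H.264 8.3.4.4 with ChromaArrayType 2 constants; a hedged sketch with the clip written out, reusing the earlier pixel/FDEC_STRIDE assumptions:

    #define PIXEL_MAX 255   /* assumed: 8-bit build */

    /* Plane prediction for 8x16 chroma (4:2:2): per H.264 8.3.4.4,
     * b = (34*H+32)>>6 and c = (5*V+32)>>6, with the vertical gradient V
     * summed over 8 row pairs instead of 4. */
    static void predict_8x16c_p_sketch( pixel *src )
    {
        int H = 0, V = 0;
        for( int i = 0; i < 4; i++ )
            H += (i+1) * ( src[4+i - FDEC_STRIDE] - src[2-i - FDEC_STRIDE] );
        for( int i = 0; i < 8; i++ )
            V += (i+1) * ( src[-1 + (8+i)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
        int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
        int b = ( 17*H + 16 ) >> 5;   /* equal to (34*H+32)>>6 */
        int c = (  5*V + 32 ) >> 6;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int v = ( a + b*(x-3) + c*(y-7) + 16 ) >> 5;
                src[x + y*FDEC_STRIDE] = v < 0 ? 0 : v > PIXEL_MAX ? PIXEL_MAX : v;
            }
    }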
     /* Extremal test case for planar prediction. */
     for( int test = 0; test < 100 && ok; test++ )
         for( int i = 0; i < 128 && ok; i++ )
         {
-            EXTREMAL_PLANE( 8 );
-            INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 64, 1 );
-            EXTREMAL_PLANE( 16 );
-            INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 64, 1 );
+            EXTREMAL_PLANE( 8, 8 );
+            INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 8, 64, 1 );
+            EXTREMAL_PLANE( 8, 16 );
+            INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P, 8, 16, 64, 1 );
+            EXTREMAL_PLANE( 16, 16 );
+            INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 16, 64, 1 );
         }
     report( "intra pred :" );
     return ret;
diff --git a/x264.c b/x264.c
index 72399569..025bc767 100644
--- a/x264.c
+++ b/x264.c
@@ -121,7 +121,7 @@ static const char * const muxer_names[] =
 
 static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
 static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] = { "i420", "i444", "rgb", 0 };
+static const char * const output_csp_names[] = { "i420", "i422", "i444", "rgb", 0 };
 
 typedef struct
 {
@@ -1131,6 +1131,8 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
     int csp = info->csp & X264_CSP_MASK;
     if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
         param->i_csp = X264_CSP_I420;
+    else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+        param->i_csp = X264_CSP_I422;
     else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
         param->i_csp = X264_CSP_I444;
     else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
@@ -1355,7 +1357,8 @@ static int parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
             case OPT_OUTPUT_CSP:
                 FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
                 // correct the parsed value to the libx264 csp value
-                output_csp = !output_csp ? X264_CSP_I420 : (output_csp == 1 ? X264_CSP_I444 : X264_CSP_RGB);
+                static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
+                param->i_csp = output_csp = output_csp_fix[output_csp];
                 break;
             default:
 generic_option:
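For reference: both x264.c hunks above rely on the x264.h constants being ordered by chroma format, so an entire format family can be matched with a single range check and the CLI name index can be mapped through a small table instead of a ternary chain. A hedged helper illustrating that invariant; the function name is hypothetical:

    /* Classify a CSP by chroma format using the ordered ranges from x264.h. */
    static int csp_chroma_format_sketch( int csp )
    {
        csp &= X264_CSP_MASK;
        if( csp >= X264_CSP_I420 && csp <= X264_CSP_NV12 ) return 420;
        if( csp >= X264_CSP_I422 && csp <= X264_CSP_NV16 ) return 422;
        if( csp >= X264_CSP_I444 && csp <= X264_CSP_YV24 ) return 444;
        return 0;   /* packed RGB variants or unknown */
    }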
diff --git a/x264.h b/x264.h
index 2cdcfb7c..34ad872c 100644
--- a/x264.h
+++ b/x264.h
@@ -41,7 +41,7 @@
 
 #include "x264_config.h"
 
-#define X264_BUILD 117
+#define X264_BUILD 118
 
 /* x264_t:
  *      opaque handler for encoder */
@@ -181,12 +181,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
 #define X264_CSP_I420           0x0001  /* yuv 4:2:0 planar */
 #define X264_CSP_YV12           0x0002  /* yvu 4:2:0 planar */
 #define X264_CSP_NV12           0x0003  /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_I444           0x0004  /* yuv 4:4:4 planar */
-#define X264_CSP_YV24           0x0005  /* yvu 4:4:4 planar */
-#define X264_CSP_BGR            0x0006  /* packed bgr 24bits */
-#define X264_CSP_BGRA           0x0007  /* packed bgr 32bits */
-#define X264_CSP_RGB            0x0008  /* packed rgb 24bits */
-#define X264_CSP_MAX            0x0009  /* end of list */
+#define X264_CSP_I422           0x0004  /* yuv 4:2:2 planar */
+#define X264_CSP_YV16           0x0005  /* yvu 4:2:2 planar */
+#define X264_CSP_NV16           0x0006  /* yuv 4:2:2, with one y plane and one packed u+v */
+#define X264_CSP_I444           0x0007  /* yuv 4:4:4 planar */
+#define X264_CSP_YV24           0x0008  /* yvu 4:4:4 planar */
+#define X264_CSP_BGR            0x0009  /* packed bgr 24bits */
+#define X264_CSP_BGRA           0x000a  /* packed bgr 32bits */
+#define X264_CSP_RGB            0x000b  /* packed rgb 24bits */
+#define X264_CSP_MAX            0x000c  /* end of list */
 #define X264_CSP_VFLIP          0x1000  /* the csp is vertically flipped */
 #define X264_CSP_HIGH_DEPTH     0x2000  /* the csp has a depth of 16 bits per pixel component */
@@ -242,7 +245,7 @@ typedef struct x264_param_t
     /* Video Properties */
     int         i_width;
     int         i_height;
-    int         i_csp;  /* CSP of encoded bitstream, only i420 supported */
+    int         i_csp;  /* CSP of encoded bitstream */
     int         i_level_idc;
     int         i_frame_total; /* number of frames to encode if known, else 0 */
@@ -579,7 +582,7 @@ void    x264_param_apply_fastfirstpass( x264_param_t * );
 /* x264_param_apply_profile:
  *      Applies the restrictions of the given profile.
  *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };
 /*      (can be NULL, in which case the function will do nothing)
  *
-- 
2.40.0
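For reference, a minimal sketch of driving the new API surface end to end: select the 4:2:2 colorspace, let x264_param_apply_profile() validate it against "high422", and allocate a matching input picture. The preset and resolution are arbitrary:

    #include "x264.h"

    int main( void )
    {
        x264_param_t param;
        x264_param_default_preset( &param, "medium", NULL );
        param.i_width  = 1280;
        param.i_height = 720;
        param.i_csp    = X264_CSP_I422;   /* 4:2:2 planar input/output */

        /* Returns negative for profiles below High 4:2:2 (e.g. "main"). */
        if( x264_param_apply_profile( &param, "high422" ) < 0 )
            return 1;

        x264_picture_t pic;
        if( x264_picture_alloc( &pic, param.i_csp, param.i_width, param.i_height ) < 0 )
            return 1;
        /* ... open the encoder with x264_encoder_open( &param ) and feed pictures ... */
        x264_picture_clean( &pic );
        return 0;
    }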