From: Fiona Glaser Date: Fri, 30 Jan 2009 11:40:54 +0000 (-0800) Subject: Massive overhaul of nnz/cbp calculation X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e394bd600ba9b1a3cee24e7d0b01dfb0acc5d1ad;p=libx264 Massive overhaul of nnz/cbp calculation Modify quantization to also calculate array_non_zero. PPC assembly changes by gpoirior. New quant asm includes some small tweaks to quant and SSE4 versions using ptest for the array_non_zero. Use this new feature of quant to merge nnz/cbp calculation directly with encoding and avoid many unnecessary calls to dequant/zigzag/decimate/etc. Also add new i16x16 DC-only iDCT with asm. Since intra encoding now directly calculates nnz, skip_intra now backs up nnz/cbp as well. Output should be equivalent except when using p4x4+RDO because of a subtlety involving old nnz values lying around. Performance increase in macroblock_encode: ~18% with dct-decimate, 30% without at CRF 25. Overall performance increase 0-6% depending on encoding settings. --- diff --git a/common/common.h b/common/common.h index 78b1efb6..97c68781 100644 --- a/common/common.h +++ b/common/common.h @@ -471,6 +471,10 @@ struct x264_t DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] ); DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] ); DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] ); + uint32_t i4x4_nnz_buf[4]; + uint32_t i8x8_nnz_buf[4]; + int i4x4_cbp; + int i8x8_cbp; /* Psy trellis DCT data */ DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] ); diff --git a/common/dct.c b/common/dct.c index f8d51e40..5f9f0fb0 100644 --- a/common/dct.c +++ b/common/dct.c @@ -369,6 +369,18 @@ static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] ) add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] ); } +static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] ) +{ + int i; + for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE ) + { + add4x4_idct_dc( &p_dst[ 0], dct[i][0] ); + add4x4_idct_dc( &p_dst[ 4], dct[i][1] ); + add4x4_idct_dc( &p_dst[ 8], dct[i][2] ); + add4x4_idct_dc( &p_dst[12], dct[i][3] ); + } +} + /**************************************************************************** * x264_dct_init: @@ -384,6 +396,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub16x16_dct = sub16x16_dct; dctf->add16x16_idct = add16x16_idct; + dctf->add16x16_idct_dc = add16x16_idct_dc; dctf->sub8x8_dct8 = sub8x8_dct8; dctf->add8x8_idct8 = add8x8_idct8; @@ -400,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub4x4_dct = x264_sub4x4_dct_mmx; dctf->add4x4_idct = x264_add4x4_idct_mmx; dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx; dctf->dct4x4dc = x264_dct4x4dc_mmx; dctf->idct4x4dc = x264_idct4x4dc_mmx; @@ -427,10 +441,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub16x16_dct = x264_sub16x16_dct_sse2; dctf->add8x8_idct = x264_add8x8_idct_sse2; dctf->add16x16_idct = x264_add16x16_idct_sse2; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2; } if( cpu&X264_CPU_SSSE3 ) + { dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3; + } #endif //HAVE_MMX #ifdef ARCH_PPC diff --git a/common/dct.h b/common/dct.h index f4474fcc..71951f9b 100644 --- a/common/dct.h +++ b/common/dct.h @@ -100,6 +100,7 @@ typedef struct void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 ); void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] ); + void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] ); 
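As a point of reference for the DC-only path declared above: add16x16_idct_dc is built from per-block DC adds, and each 4x4 block simply receives its rounded, descaled DC term. A minimal scalar sketch of that per-block operation (illustrative only; the function name and the clip helper below are assumptions, not code from this patch):

    #include <stdint.h>

    static inline uint8_t clip_uint8( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;   /* assumed helper */
    }

    /* Add one descaled DC coefficient to every pixel of a 4x4 block;
     * a DC-only 4x4 iDCT reduces to exactly this. */
    static void add4x4_idct_dc_sketch( uint8_t *dst, int stride, int16_t dc )
    {
        int x, y;
        int offset = (dc + 32) >> 6;   /* same rounding/descale as the full iDCT */
        for( y = 0; y < 4; y++, dst += stride )
            for( x = 0; x < 4; x++ )
                dst[x] = clip_uint8( dst[x] + offset );
    }

The SSE2/SSSE3 versions added in dct-a.asm below apply the same offsets to a full 16-pixel row (four blocks) at a time, splitting each DC term into a positive and a negative part so the adjustment can be done with saturating byte adds and subtracts.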
void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 ); void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] ); diff --git a/common/ppc/quant.c b/common/ppc/quant.c index 64b34ab5..d1d9d72a 100644 --- a/common/ppc/quant.c +++ b/common/ppc/quant.c @@ -30,10 +30,10 @@ mfvA = vec_ld((idx0), mf); \ mfvB = vec_ld((idx1), mf); \ biasvA = vec_ld((idx0), bias); \ biasvB = vec_ld((idx1), bias); \ -mskA = vec_cmplt(temp1v, zerov); \ -mskB = vec_cmplt(temp2v, zerov); \ -coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \ -coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +mskB = vec_cmplt(temp2v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \ +coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \ coefvA = vec_adds(coefvA, biasvA); \ coefvB = vec_adds(coefvB, biasvB); \ multEvenvA = vec_mule(coefvA, mfvA); \ @@ -51,17 +51,20 @@ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_adds(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), (int16_t*)dct); \ temp2v = vec_adds(temp2v, vec_and(mskB, one)); \ +nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), (int16_t*)dct); -void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) +int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1);; + vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; @@ -75,20 +78,18 @@ void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[1 qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - QUANT_16_U( 0, 16 ); + return vec_any_ne(nz, zero_s16v); } // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled" #define QUANT_16_U_DC( idx0, idx1 ) \ temp1v = vec_ld((idx0), *dct); \ temp2v = vec_ld((idx1), *dct); \ -mskA = vec_cmplt(temp1v, zerov); \ -mskB = vec_cmplt(temp2v, zerov); \ -coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \ -coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +mskB = vec_cmplt(temp2v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ +coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\ coefvA = vec_add(coefvA, biasv); \ coefvB = vec_add(coefvB, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ @@ -106,15 +107,18 @@ temp2v = vec_xor(temp2v, mskB); \ temp1v = vec_add(temp1v, vec_and(mskA, one)); \ vec_st(temp1v, (idx0), (int16_t*)dct); \ temp2v = vec_add(temp2v, vec_and(mskB, one)); \ +nz = vec_or(nz, vec_or(temp1v, temp2v)); \ vec_st(temp2v, (idx1), (int16_t*)dct); -void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) +int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1); + vec_s16_t nz = zero_s16v; vector bool short mskB; vec_u16_t coefvB; @@ -137,18 +141,16 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ) bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - QUANT_16_U_DC( 0, 16 ); + return vec_any_ne(nz, zero_s16v); } // DC quant 
of a whole 2x2 block #define QUANT_4_U_DC( idx0 ) \ const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \ temp1v = vec_ld((idx0), *dct); \ -mskA = vec_cmplt(temp1v, zerov); \ -coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \ +mskA = vec_cmplt(temp1v, zero_s16v); \ +coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\ coefvA = vec_add(coefvA, biasv); \ multEvenvA = vec_mule(coefvA, mfv); \ multOddvA = vec_mulo(coefvA, mfv); \ @@ -158,15 +160,18 @@ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(mul temp2v = vec_xor(temp2v, mskA); \ temp2v = vec_add(temp2v, vec_and(mskA, one)); \ temp1v = vec_sel(temp1v, temp2v, sel); \ +nz = vec_or(nz, temp1v); \ vec_st(temp1v, (idx0), (int16_t*)dct); -void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) +int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; - vec_s16_t zerov, one; + vec_s16_t one = vec_splat_s16(1); + vec_s16_t nz = zero_s16v; vec_s16_t temp1v, temp2v; @@ -185,42 +190,41 @@ void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ) bias_u.s[0]=bias; biasv = vec_splat(bias_u.v, 0); - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); - + static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0); QUANT_4_U_DC(0); + return vec_any_ne(vec_and(nz, mask2), zero_s16v); } -void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) +int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { + LOAD_ZERO; vector bool short mskA; vec_u32_t i_qbitsv; vec_u16_t coefvA; vec_u32_t multEvenvA, multOddvA; vec_u16_t mfvA; vec_u16_t biasvA; - vec_s16_t zerov, one; - + vec_s16_t one = vec_splat_s16(1);; + vec_s16_t nz = zero_s16v; + vector bool short mskB; vec_u16_t coefvB; vec_u32_t multEvenvB, multOddvB; vec_u16_t mfvB; vec_u16_t biasvB; - + vec_s16_t temp1v, temp2v; vec_u32_u qbits_u; qbits_u.s[0]=16; i_qbitsv = vec_splat(qbits_u.v, 0); - - zerov = vec_splat_s16(0); - one = vec_splat_s16(1); int i; for ( i=0; i<4; i++ ) { QUANT_16_U( i*2*16, i*2*16+16 ); } + return vec_any_ne(nz, zero_s16v); } #define DEQUANT_SHL() \ diff --git a/common/ppc/quant.h b/common/ppc/quant.h index 05049003..f55a934a 100644 --- a/common/ppc/quant.h +++ b/common/ppc/quant.h @@ -21,11 +21,11 @@ #ifndef X264_PPC_QUANT_H #define X264_PPC_QUANT_H -void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); -void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); +int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); -void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ); -void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ); +int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias ); +int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias ); void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); diff --git a/common/quant.c b/common/quant.c index ac798a25..daf2b5a2 100644 --- a/common/quant.c +++ b/common/quant.c @@ -36,35 +36,41 @@ (coef) = (f + (coef)) * (mf) >> 16; \ else \ (coef) = - ((f - (coef)) * (mf) >> 16); \ + nz |= (coef); \ } -static void quant_8x8( int16_t 
dct[8][8], uint16_t mf[64], uint16_t bias[64] ) +static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ) { - int i; + int i, nz = 0; for( i = 0; i < 64; i++ ) QUANT_ONE( dct[0][i], mf[i], bias[i] ); + return !!nz; } -static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) +static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ) { - int i; + int i, nz = 0; for( i = 0; i < 16; i++ ) QUANT_ONE( dct[0][i], mf[i], bias[i] ); + return !!nz; } -static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) +static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias ) { - int i; + int i, nz = 0; for( i = 0; i < 16; i++ ) QUANT_ONE( dct[0][i], mf, bias ); + return !!nz; } -static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) +static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias ) { + int nz = 0; QUANT_ONE( dct[0][0], mf, bias ); QUANT_ONE( dct[0][1], mf, bias ); QUANT_ONE( dct[0][2], mf, bias ); QUANT_ONE( dct[0][3], mf, bias ); + return !!nz; } #define DEQUANT_SHL( x ) \ @@ -402,6 +408,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->decimate_score16 = x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; } + + if( cpu&X264_CPU_SSE4 ) + { + pf->quant_4x4_dc = x264_quant_4x4_dc_sse4; + pf->quant_4x4 = x264_quant_4x4_sse4; + pf->quant_8x8 = x264_quant_8x8_sse4; + } #endif // HAVE_MMX #ifdef ARCH_PPC diff --git a/common/quant.h b/common/quant.h index eaac5937..b8a7b988 100644 --- a/common/quant.h +++ b/common/quant.h @@ -25,10 +25,10 @@ typedef struct { - void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); - void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); - void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias ); - void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias ); + int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] ); + int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] ); + int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias ); + int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias ); void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index a474932f..156a7ae4 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -33,6 +33,7 @@ pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 +pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 SECTION .text @@ -324,6 +325,104 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2 movhps [r0+FDEC_STRIDE* 3], xmm5 ret +cglobal x264_add16x16_idct_dc_mmx, 2,3 + mov r2, 4 +.loop: + movq mm0, [r1] + pxor mm1, mm1 + paddw mm0, [pw_32 GLOBAL] + psraw mm0, 6 + psubw mm1, mm0 + packuswb mm0, mm0 + packuswb mm1, mm1 + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm2, mm0, 0xFA + pshufw mm3, mm1, 0xFA + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + ADD_DC mm0, mm1, r0 + ADD_DC mm2, mm3, r0+8 + add r1, 8 + add r0, FDEC_STRIDE*4 + dec r2 + jg .loop + ret + +%macro IDCT_DC_STORE 3 + movdqa xmm4, [r0+%1+FDEC_STRIDE*0] + movdqa xmm5, [r0+%1+FDEC_STRIDE*1] + movdqa xmm6, [r0+%1+FDEC_STRIDE*2] + movdqa xmm7, [r0+%1+FDEC_STRIDE*3] + paddusb xmm4, %2 + paddusb xmm5, %2 + paddusb xmm6, %2 + paddusb 
xmm7, %2 + psubusb xmm4, %3 + psubusb xmm5, %3 + psubusb xmm6, %3 + psubusb xmm7, %3 + movdqa [r0+%1+FDEC_STRIDE*0], xmm4 + movdqa [r0+%1+FDEC_STRIDE*1], xmm5 + movdqa [r0+%1+FDEC_STRIDE*2], xmm6 + movdqa [r0+%1+FDEC_STRIDE*3], xmm7 +%endmacro + +cglobal x264_add16x16_idct_dc_sse2, 2,2 + call .loop + add r0, FDEC_STRIDE*4 +.loop: + add r0, FDEC_STRIDE*4 + movq xmm0, [r1+0] + movq xmm2, [r1+8] + add r1, 16 + punpcklwd xmm0, xmm0 + punpcklwd xmm2, xmm2 + pxor xmm1, xmm1 + pxor xmm3, xmm3 + paddw xmm0, [pw_32 GLOBAL] + paddw xmm2, [pw_32 GLOBAL] + psraw xmm0, 6 + psraw xmm2, 6 + psubw xmm1, xmm0 + psubw xmm3, xmm2 + packuswb xmm0, xmm1 + packuswb xmm2, xmm3 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + punpcklbw xmm0, xmm0 + punpcklbw xmm2, xmm2 + punpckhbw xmm1, xmm1 + punpckhbw xmm3, xmm3 + IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1 + IDCT_DC_STORE 0, xmm2, xmm3 + ret + +cglobal x264_add16x16_idct_dc_ssse3, 2,2 + call .loop + add r0, FDEC_STRIDE*4 +.loop: + add r0, FDEC_STRIDE*4 + movdqa xmm0, [r1] + add r1, 16 + pxor xmm1, xmm1 + paddw xmm0, [pw_32 GLOBAL] + psraw xmm0, 6 + psubw xmm1, xmm0 + movdqa xmm5, [ pb_idctdc_unpack GLOBAL] + movdqa xmm6, [pb_idctdc_unpack2 GLOBAL] + packuswb xmm0, xmm0 + packuswb xmm1, xmm1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pshufb xmm0, xmm5 + pshufb xmm2, xmm6 + pshufb xmm1, xmm5 + pshufb xmm3, xmm6 + IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1 + IDCT_DC_STORE 0, xmm2, xmm3 + ret + ;----------------------------------------------------------------------------- ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] ) ;----------------------------------------------------------------------------- diff --git a/common/x86/dct.h b/common/x86/dct.h index d30fa972..99392761 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -34,9 +34,12 @@ void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] ); void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] ); void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] ); +void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] ); void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] ); void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] ); +void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] ); void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] ); +void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] ); void x264_dct4x4dc_mmx ( int16_t d[4][4] ); void x264_idct4x4dc_mmx ( int16_t d[4][4] ); diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index d1fd8693..d1b39919 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -29,6 +29,7 @@ SECTION_RODATA pb_1: times 16 db 1 pw_1: times 8 dw 1 pd_1: times 4 dd 1 +pb_01: times 8 db 0, 1 %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -70,7 +71,7 @@ decimate_mask_table4: SECTION .text -%macro QUANT_DC_START 0 +%macro QUANT_DC_START_MMX 0 movd m6, r1m ; mf movd m7, r2m ; bias %ifidn m0, mm0 @@ -84,6 +85,14 @@ SECTION .text %endif %endmacro +%macro QUANT_DC_START_SSSE3 0 + movdqa m5, [pb_01 GLOBAL] + movd m6, r1m ; mf + movd m7, r2m ; bias + pshufb m6, m5 + pshufb m7, m5 +%endmacro + %macro PABSW_MMX 2 pxor %1, %1 pcmpgtw %1, %2 @@ -105,7 +114,7 @@ SECTION .text psignw %1, %2 %endmacro -%macro QUANT_ONE 3 +%macro QUANT_ONE 4 ;;; %1 (m64) dct[y][x] ;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t) ;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t) @@ 
-115,6 +124,62 @@ SECTION .text pmulhuw m0, %2 ; divide PSIGNW m0, m1 ; restore sign mova %1, m0 ; store +%if %4 + por m5, m0 +%else + SWAP m5, m0 +%endif +%endmacro + +%macro QUANT_TWO 7 + mova m1, %1 + mova m3, %2 + PABSW m0, m1 + PABSW m2, m3 + paddusw m0, %5 + paddusw m2, %6 + pmulhuw m0, %3 + pmulhuw m2, %4 + PSIGNW m0, m1 + PSIGNW m2, m3 + mova %1, m0 + mova %2, m2 +%if %7 + por m5, m0 + por m5, m2 +%else + SWAP m5, m0 + por m5, m2 +%endif +%endmacro + +%macro QUANT_END_MMX 0 + xor eax, eax +%ifndef ARCH_X86_64 +%if mmsize==8 + packsswb m5, m5 + movd ecx, m5 + test ecx, ecx +%else + pxor m4, m4 + pcmpeqb m5, m4 + pmovmskb ecx, m5 + cmp ecx, (1<pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8]; int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); int i_cost = 0; + h->mb.i_cbp_luma = 0; b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0]; // FIXME some bias like in i4x4? @@ -732,6 +733,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i8x8_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i8x8_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i8x8_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i8x8_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i8x8_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i8x8_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i8x8_dct_buf, h->dct.luma8x8, sizeof(h->mb.pic.i8x8_dct_buf) ); } @@ -751,6 +757,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ { int i_cost; int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 ); + h->mb.i_cbp_luma = 0; b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0]; if( a->i_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; @@ -817,6 +824,11 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.i4x4_fdec_buf, 16, p_dst, FDEC_STRIDE, 16 ); + h->mb.pic.i4x4_nnz_buf[0] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]]; + h->mb.pic.i4x4_nnz_buf[1] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]]; + h->mb.pic.i4x4_nnz_buf[2] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]]; + h->mb.pic.i4x4_nnz_buf[3] = *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]]; + h->mb.pic.i4x4_cbp = h->mb.i_cbp_luma; if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->mb.pic.i4x4_dct_buf, h->dct.luma4x4, sizeof(h->mb.pic.i4x4_dct_buf) ); } @@ -1951,6 +1963,8 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) x264_macroblock_cache_ref( h, 2, 0, 2, 2, 0, a->l0.me8x8[1].i_ref ); x264_macroblock_cache_ref( h, 0, 2, 2, 2, 0, a->l0.me8x8[2].i_ref ); x264_macroblock_cache_ref( h, 2, 2, 2, 2, 0, a->l0.me8x8[3].i_ref ); + /* FIXME: In the 8x8 blocks where RDO isn't run, the NNZ values used for context selection + * for future blocks are those left over from previous RDO calls. 
*/ for( i = 0; i < 4; i++ ) { int costs[4] = {a->l0.i_cost4x4[i], a->l0.i_cost8x4[i], a->l0.i_cost4x8[i], a->l0.me8x8[i].cost}; diff --git a/encoder/cabac.c b/encoder/cabac.c index 4fa74033..2015da5e 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -1142,20 +1142,10 @@ static void x264_subpartition_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, i static void x264_partition_i8x8_size_cabac( x264_t *h, x264_cabac_t *cb, int i8, int i_mode ) { const int i_pred = x264_mb_predict_intra4x4_mode( h, 4*i8 ); - const int nnz = array_non_zero(h->dct.luma8x8[i8]); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - if( nnz ) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101; + if( h->mb.i_cbp_luma & (1 << i8) ) block_residual_write_cabac_8x8( h, cb, 4*i8, h->dct.luma8x8[i8] ); - } - else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } } static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, int i_mode ) @@ -1163,7 +1153,6 @@ static void x264_partition_i4x4_size_cabac( x264_t *h, x264_cabac_t *cb, int i4, const int i_pred = x264_mb_predict_intra4x4_mode( h, i4 ); i_mode = x264_mb_pred_mode4x4_fix( i_mode ); x264_cabac_mb_intra4x4_pred_mode( cb, i_pred, i_mode ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] ); block_residual_write_cabac( h, cb, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); } diff --git a/encoder/cavlc.c b/encoder/cavlc.c index 4f4ff033..e499fac5 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -702,7 +702,6 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode ) static int x264_partition_i4x4_size_cavlc( x264_t *h, int i4, int i_mode ) { h->out.bs.i_bits_encoded = cavlc_intra4x4_pred_size( h, i4, i_mode ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] ); block_residual_write_cavlc( h, &h->out.bs, DCT_LUMA_4x4, i4, h->dct.luma4x4[i4], 16 ); return h->out.bs.i_bits_encoded; } diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 30df7781..6faa305e 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -84,26 +84,38 @@ static inline void dct2x2dc( int16_t d[2][2], int16_t dct4x4[4][4][4] ) dct4x4[3][0][0] = 0; } -static ALWAYS_INLINE void x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) +static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, int16_t dct[4][4], int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int i_quant_cat = b_intra ? CQM_4IY : CQM_4PY; if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); + return x264_quant_4x4_trellis( h, dct, i_quant_cat, i_qp, i_ctxBlockCat, b_intra, idx ); else - h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); + return h->quantf.quant_4x4( dct, h->quant4_mf[i_quant_cat][i_qp], h->quant4_bias[i_quant_cat][i_qp] ); } -static ALWAYS_INLINE void x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) +static ALWAYS_INLINE int x264_quant_8x8( x264_t *h, int16_t dct[8][8], int i_qp, int b_intra, int idx ) { int i_quant_cat = b_intra ? 
CQM_8IY : CQM_8PY; if( h->mb.b_trellis ) - x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); + return x264_quant_8x8_trellis( h, dct, i_quant_cat, i_qp, b_intra, idx ); else - h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); + return h->quantf.quant_8x8( dct, h->quant8_mf[i_quant_cat][i_qp], h->quant8_bias[i_quant_cat][i_qp] ); } +/* All encoding functions must output the correct CBP and NNZ values. + * The entropy coding functions will check CBP first, then NNZ, before + * actually reading the DCT coefficients. NNZ still must be correct even + * if CBP is zero because of the use of NNZ values for context selection. + * "NNZ" need only be 0 or 1 rather than the exact coefficient count because + * that is only needed in CAVLC, and will be calculated by CAVLC's residual + * coding and stored as necessary. */ + +/* This means that decimation can be done merely by adjusting the CBP and NNZ + * rather than memsetting the coefficients. */ + void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) { + int nz; uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]]; uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]]; DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); @@ -111,29 +123,36 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) if( h->mb.b_lossless ) { h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst ); + nz = array_non_zero( h->dct.luma4x4[idx] ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; + h->mb.i_cbp_luma |= nz<<(idx>>2); return; } h->dctf.sub4x4_dct( dct4x4, p_src, p_dst ); - x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx ); - - if( array_non_zero( dct4x4 ) ) + nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 1, idx ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; + if( nz ) { + h->mb.i_cbp_luma |= 1<<(idx>>2); h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4IY], i_qp ); - - /* output samples to fdec */ h->dctf.add4x4_idct( p_dst, dct4x4 ); } - else - memset( h->dct.luma4x4[idx], 0, sizeof(h->dct.luma4x4[idx])); +} + +#define STORE_8x8_NNZ(idx,nz)\ +{\ + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+0]] = nz * 0x0101;\ + *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[idx*4+2]] = nz * 0x0101;\ } void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) { int x = 8 * (idx&1); int y = 8 * (idx>>1); + int nz; uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE]; uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE]; DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); @@ -141,16 +160,25 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) if( h->mb.b_lossless ) { h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst ); + nz = array_non_zero( h->dct.luma8x8[idx] ); + STORE_8x8_NNZ(idx,nz); + h->mb.i_cbp_luma |= nz<<idx; return; } h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst ); - x264_quant_8x8( h, dct8x8, i_qp, 1, idx ); - + nz = x264_quant_8x8( h, dct8x8, i_qp, 1, idx ); h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8 ); - h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp ); - h->dctf.add8x8_idct8( p_dst, dct8x8 ); + if( nz ) + { + h->mb.i_cbp_luma |= 1<<idx; + h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8IY], i_qp ); + h->dctf.add8x8_idct8( p_dst, dct8x8 ); + STORE_8x8_NNZ(idx,1); + } + else + STORE_8x8_NNZ(idx,0); } static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) { @@ -161,7 +189,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); DECLARE_ALIGNED_16( int16_t
dct_dc4x4[4][4] ); - int i; + int i, nz; if( h->mb.b_lossless ) { @@ -172,12 +200,18 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od ); dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0]; h->dct.luma4x4[i][0] = 0; + nz = array_non_zero( h->dct.luma4x4[i] ); + h->mb.cache.non_zero_count[x264_scan8[i]] = nz; + h->mb.i_cbp_luma |= nz; } + h->mb.i_cbp_luma *= 0xf; + h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( dct_dc4x4 ); h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); return; } h->dctf.sub16x16_dct( dct4x4, p_src, p_dst ); + for( i = 0; i < 16; i++ ) { /* copy dc coeff */ @@ -185,36 +219,45 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) dct4x4[i][0][0] = 0; /* quant/scan/dequant */ - x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i ); - - h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] ); - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp ); + nz = x264_quant_4x4( h, dct4x4[i], i_qp, DCT_LUMA_AC, 1, i ); + h->mb.cache.non_zero_count[x264_scan8[i]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[i], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IY], i_qp ); + h->mb.i_cbp_luma = 0xf; + } } h->dctf.dct4x4dc( dct_dc4x4 ); if( h->mb.b_trellis ) - x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1); + nz = x264_quant_dc_trellis( h, (int16_t*)dct_dc4x4, CQM_4IY, i_qp, DCT_LUMA_DC, 1); else - h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 ); - h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); + nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[CQM_4IY][i_qp][0]>>1, h->quant4_bias[CQM_4IY][i_qp][0]<<1 ); - /* output samples to fdec */ - h->dctf.idct4x4dc( dct_dc4x4 ); - h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */ - - /* calculate dct coeffs */ - for( i = 0; i < 16; i++ ) + h->mb.cache.non_zero_count[x264_scan8[24]] = nz; + if( nz ) { - /* copy dc coeff */ - dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]]; + h->zigzagf.scan_4x4( h->dct.luma16x16_dc, dct_dc4x4 ); + + /* output samples to fdec */ + h->dctf.idct4x4dc( dct_dc4x4 ); + h->quantf.dequant_4x4_dc( dct_dc4x4, h->dequant4_mf[CQM_4IY], i_qp ); /* XXX not inversed */ + if( h->mb.i_cbp_luma ) + for( i = 0; i < 16; i++ ) + dct4x4[i][0][0] = dct_dc4x4[0][block_idx_xy_1d[i]]; } + /* put pixels to fdec */ - h->dctf.add16x16_idct( p_dst, dct4x4 ); + if( h->mb.i_cbp_luma ) + h->dctf.add16x16_idct( p_dst, dct4x4 ); + else if( nz ) + h->dctf.add16x16_idct_dc( p_dst, dct_dc4x4 ); } void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) { - int i, ch, nz; + int i, ch, nz, nz_dc; int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate); h->mb.i_cbp_chroma = 0; @@ -223,6 +266,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) uint8_t *p_src = h->mb.pic.p_fenc[1+ch]; uint8_t *p_dst = h->mb.pic.p_fdec[1+ch]; int i_decimate_score = 0; + int nz_ac = 0; DECLARE_ALIGNED_16( int16_t dct2x2[2][2] ); DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); @@ -250,52 +294,49 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) for( i = 0; i < 4; i++ ) { if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 ); + nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 0 ); else - 
h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); - h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); - - if( b_decimate ) - i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] ); + nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] ); + h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz; + if( nz ) + { + nz_ac = 1; + h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*4], dct4x4[i] ); + h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + if( b_decimate ) + i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*4] ); + } } if( h->mb.b_trellis ) - x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter ); + nz_dc = x264_quant_dc_trellis( h, (int16_t*)dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter ); else - h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); + nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 ); - if( b_decimate && i_decimate_score < 7 ) + h->mb.cache.non_zero_count[x264_scan8[25]+ch] = nz_dc; + + if( (b_decimate && i_decimate_score < 7) || !nz_ac ) { /* Decimate the block */ h->mb.cache.non_zero_count[x264_scan8[16+0]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+1]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+2]+24*ch] = 0; h->mb.cache.non_zero_count[x264_scan8[16+3]+24*ch] = 0; - if( !array_non_zero( dct2x2 ) ) /* Whole block is empty */ - { - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 0; + if( !nz_dc ) /* Whole block is empty */ continue; - } /* DC-only */ - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = 1; zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); idct_dequant_2x2_dconly( dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); h->dctf.add8x8_idct_dc( p_dst, dct2x2 ); } else { - for( i = 0; i < 4; i++ ) + h->mb.i_cbp_chroma = 1; + if( nz_dc ) { - nz = array_non_zero( h->dct.luma4x4[16+ch*4+i] ); - h->mb.cache.non_zero_count[x264_scan8[16+ch*4+i]] = nz; - h->mb.i_cbp_chroma |= nz; - if( nz ) - h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp ); + zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); + idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); } - /* Don't optimize for the AC-only case--it's very rare */ - h->mb.cache.non_zero_count[x264_scan8[25]+ch] = array_non_zero( dct2x2 ); - zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 ); - idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp ); h->dctf.add8x8_idct( p_dst, dct4x4 ); } } @@ -423,8 +464,9 @@ void x264_macroblock_encode( x264_t *h ) int i_qp = h->mb.i_qp; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; int b_force_no_skip = 0; - int i,j,idx; - uint8_t nnz8x8[4] = {1,1,1,1}; + int i,idx,nz; + h->mb.i_cbp_luma = 0; + h->mb.cache.non_zero_count[x264_scan8[24]] = 0; if( h->sh.b_mbaff && h->mb.i_mb_xy == h->sh.i_first_mb + h->mb.i_mb_stride @@ -479,6 +521,11 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i8x8_fdec_buf, 16, 16 ); + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i8x8_nnz_buf[0]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i8x8_nnz_buf[1]; + 
*(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i8x8_nnz_buf[2]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i8x8_nnz_buf[3]; + h->mb.i_cbp_luma = h->mb.pic.i8x8_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma8x8, h->mb.pic.i8x8_dct_buf, sizeof(h->mb.pic.i8x8_dct_buf) ); @@ -496,8 +543,6 @@ void x264_macroblock_encode( x264_t *h ) x264_mb_encode_i8x8( h, i, i_qp ); } - for( i = 0; i < 4; i++ ) - nnz8x8[i] = array_non_zero( h->dct.luma8x8[i] ); } else if( h->mb.i_type == I_4x4 ) { @@ -506,6 +551,11 @@ void x264_macroblock_encode( x264_t *h ) if( h->mb.i_skip_intra ) { h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[0], FDEC_STRIDE, h->mb.pic.i4x4_fdec_buf, 16, 16 ); + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = h->mb.pic.i4x4_nnz_buf[0]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = h->mb.pic.i4x4_nnz_buf[1]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = h->mb.pic.i4x4_nnz_buf[2]; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = h->mb.pic.i4x4_nnz_buf[3]; + h->mb.i_cbp_luma = h->mb.pic.i4x4_cbp; /* In RD mode, restore the now-overwritten DCT data. */ if( h->mb.i_skip_intra == 2 ) h->mc.memcpy_aligned( h->dct.luma4x4, h->mb.pic.i4x4_dct_buf, sizeof(h->mb.pic.i4x4_dct_buf) ); @@ -545,7 +595,9 @@ void x264_macroblock_encode( x264_t *h ) h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE, h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE ); - nnz8x8[i8x8] = array_non_zero( h->dct.luma8x8[i8x8] ); + nz = array_non_zero( h->dct.luma8x8[i8x8] ); + STORE_8x8_NNZ(i8x8,nz); + h->mb.i_cbp_luma |= nz << i8x8; } else for( i4x4 = 0; i4x4 < 16; i4x4++ ) @@ -553,6 +605,9 @@ void x264_macroblock_encode( x264_t *h ) h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4], h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4], h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] ); + nz = array_non_zero( h->dct.luma4x4[i4x4] ); + h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz; + h->mb.i_cbp_luma |= nz << (i4x4>>2); } } else if( h->mb.b_transform_8x8 ) @@ -566,31 +621,44 @@ void x264_macroblock_encode( x264_t *h ) { if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 ); - x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx ); + nz = x264_quant_8x8( h, dct8x8[idx], i_qp, 0, idx ); - h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); - - if( b_decimate ) + if( nz ) { - int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] ); - i_decimate_mb += i_decimate_8x8; - if( i_decimate_8x8 < 4 ) - nnz8x8[idx] = 0; + h->zigzagf.scan_8x8( h->dct.luma8x8[idx], dct8x8[idx] ); + if( b_decimate ) + { + int i_decimate_8x8 = h->quantf.decimate_score64( h->dct.luma8x8[idx] ); + i_decimate_mb += i_decimate_8x8; + if( i_decimate_8x8 >= 4 ) + h->mb.i_cbp_luma |= 1<mb.i_cbp_luma |= 1<mb.i_cbp_luma = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0; + } else { for( idx = 0; idx < 4; idx++ ) - if( nnz8x8[idx] ) + { + if( h->mb.i_cbp_luma&(1<quantf.dequant_8x8( dct8x8[idx], h->dequant8_mf[CQM_8PY], i_qp ); h->dctf.add8x8_idct8( &h->mb.pic.p_fdec[0][(idx&1)*8 + (idx>>1)*8*FDEC_STRIDE], dct8x8[idx] ); + STORE_8x8_NNZ(idx,1); } + else + STORE_8x8_NNZ(idx,0); + } } } else @@ -601,41 +669,61 @@ void 
x264_macroblock_encode( x264_t *h ) for( i8x8 = 0; i8x8 < 4; i8x8++ ) { - int i_decimate_8x8; + int i_decimate_8x8 = 0; + int cbp = 0; /* encode one 4x4 block */ - i_decimate_8x8 = 0; for( i4x4 = 0; i4x4 < 4; i4x4++ ) { idx = i8x8 * 4 + i4x4; if( h->mb.b_noise_reduction ) h->quantf.denoise_dct( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); - x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx ); + nz = x264_quant_4x4( h, dct4x4[idx], i_qp, DCT_LUMA_4x4, 0, idx ); + h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; - h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); - - if( b_decimate && i_decimate_8x8 < 6 ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] ); + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[idx], dct4x4[idx] ); + h->quantf.dequant_4x4( dct4x4[idx], h->dequant4_mf[CQM_4PY], i_qp ); + if( b_decimate && i_decimate_8x8 < 6 ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[idx] ); + cbp = 1; + } } /* decimate this 8x8 block */ i_decimate_mb += i_decimate_8x8; - if( i_decimate_8x8 < 4 && b_decimate ) - nnz8x8[i8x8] = 0; + if( b_decimate ) + { + if( i_decimate_8x8 < 4 ) + STORE_8x8_NNZ(i8x8,0) + else + h->mb.i_cbp_luma |= 1<dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + h->mb.i_cbp_luma |= 1<quantf.dequant_4x4( dct4x4[i8x8*4+i], h->dequant4_mf[CQM_4PY], i_qp ); - h->dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); - } + if( i_decimate_mb < 6 ) + { + h->mb.i_cbp_luma = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 0]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 2]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[ 8]] = 0; + *(uint32_t*)&h->mb.cache.non_zero_count[x264_scan8[10]] = 0; + } + else + { + for( i8x8 = 0; i8x8 < 4; i8x8++ ) + if( h->mb.i_cbp_luma&(1<dctf.add8x8_idct( &h->mb.pic.p_fdec[0][(i8x8&1)*8 + (i8x8>>1)*8*FDEC_STRIDE], &dct4x4[i8x8*4] ); + } } } } @@ -656,49 +744,6 @@ void x264_macroblock_encode( x264_t *h ) /* encode the 8x8 blocks */ x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp ); - /* coded block pattern and non_zero_count */ - h->mb.i_cbp_luma = 0x00; - if( h->mb.i_type == I_16x16 ) - { - for( i = 0; i < 16; i++ ) - { - int nz = array_non_zero( h->dct.luma4x4[i] ); - h->mb.cache.non_zero_count[x264_scan8[i]] = nz; - h->mb.i_cbp_luma |= nz; - } - h->mb.i_cbp_luma *= 0xf; - h->mb.cache.non_zero_count[x264_scan8[24]] = array_non_zero( h->dct.luma16x16_dc ); - } - else - { - for( i = 0; i < 4; i++) - { - if(!nnz8x8[i]) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+i*4]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+i*4]] = 0; - } - else if( h->mb.b_transform_8x8 ) - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[0+4*i]] = nnz8x8[i] * 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[2+4*i]] = nnz8x8[i] * 0x0101; - h->mb.i_cbp_luma |= nnz8x8[i] << i; - } - else - { - int nz, cbp = 0; - for( j = 0; j < 4; j++ ) - { - nz = array_non_zero( h->dct.luma4x4[j+4*i] ); - h->mb.cache.non_zero_count[x264_scan8[j+4*i]] = nz; - cbp |= nz; - } - h->mb.i_cbp_luma |= cbp << i; - } - } - h->mb.cache.non_zero_count[x264_scan8[24]] = 0; - } - if( h->param.b_cabac ) { i_cbp_dc = h->mb.cache.non_zero_count[x264_scan8[24]] @@ -770,8 +815,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) /* encode one 4x4 block */ for( i4x4 = 0; i4x4 < 4; i4x4++ ) { - h->quantf.quant_4x4( dct4x4[i4x4], 
h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ); - if( !array_non_zero(dct4x4[i4x4]) ) + if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PY][i_qp], h->quant4_bias[CQM_4PY][i_qp] ) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); i_decimate_mb += h->quantf.decimate_score16( dctscan ); @@ -805,15 +849,13 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir ) /* calculate dct DC */ dct2x2dc( dct2x2, dct4x4 ); - h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ); - if( array_non_zero(dct2x2) ) + if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) ) return 0; /* calculate dct coeffs */ for( i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ ) { - h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - if( !array_non_zero(dct4x4[i4x4]) ) + if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) ) continue; h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] ); i_decimate_mb += h->quantf.decimate_score15( dctscan ); @@ -865,7 +907,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) uint8_t *p_fdec = h->mb.pic.p_fdec[0] + (i8&1)*8 + (i8>>1)*8*FDEC_STRIDE; int b_decimate = h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate; int nnz8x8 = 0; - int ch; + int ch, nz; x264_mb_mc_8x8( h, i8 ); @@ -876,8 +918,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec ); nnz8x8 = array_non_zero( h->dct.luma8x8[i8] ); - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101 * nnz8x8; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101 * nnz8x8; + STORE_8x8_NNZ(i8,nnz8x8); } else { @@ -898,9 +939,8 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec ); h->dct.luma4x4[16+i8+ch*4][0] = 0; + h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] ); } - h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero( h->dct.luma4x4[16+i8] ); - h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero( h->dct.luma4x4[20+i8] ); } else { @@ -908,67 +948,53 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) { DECLARE_ALIGNED_16( int16_t dct8x8[8][8] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); - x264_quant_8x8( h, dct8x8, i_qp, 0, i8 ); - h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); - - if( b_decimate && !h->mb.b_trellis ) - nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] ); - else - nnz8x8 = array_non_zero( dct8x8 ); - + nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 ); if( nnz8x8 ) { - h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); - h->dctf.add8x8_idct8( p_fdec, dct8x8 ); - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0x0101; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0x0101; + h->zigzagf.scan_8x8( h->dct.luma8x8[i8], dct8x8 ); + + if( b_decimate && !h->mb.b_trellis ) + nnz8x8 = 4 <= h->quantf.decimate_score64( h->dct.luma8x8[i8] ); + + if( nnz8x8 ) + { + h->quantf.dequant_8x8( dct8x8, h->dequant8_mf[CQM_8PY], i_qp ); + h->dctf.add8x8_idct8( p_fdec, dct8x8 ); + STORE_8x8_NNZ(i8,1); + } + else + STORE_8x8_NNZ(i8,0); } else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0; - 
*(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } + STORE_8x8_NNZ(i8,0); } else { int i4; + int i_decimate_8x8 = 0; DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] ); h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); for( i4 = 0; i4 < 4; i4++ ) - x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 ); - - for( i4 = 0; i4 < 4; i4++ ) - h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); - - if( b_decimate ) { - int i_decimate_8x8 = 0; - for( i4 = 0; i4 < 4 && i_decimate_8x8 < 4; i4++ ) - i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] ); - nnz8x8 = 4 <= i_decimate_8x8; + nz = x264_quant_4x4( h, dct4x4[i4], i_qp, DCT_LUMA_4x4, 0, i8*4+i4 ); + h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = nz; + if( nz ) + { + h->zigzagf.scan_4x4( h->dct.luma4x4[i8*4+i4], dct4x4[i4] ); + h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); + if( b_decimate ) + i_decimate_8x8 += h->quantf.decimate_score16( h->dct.luma4x4[i8*4+i4] ); + nnz8x8 = 1; + } } - else - nnz8x8 = array_non_zero( dct4x4 ); + + if( b_decimate && i_decimate_8x8 < 4 ) + nnz8x8 = 0; if( nnz8x8 ) - { - for( i4 = 0; i4 < 4; i4++ ) - { - if( array_non_zero( dct4x4[i4] ) ) - { - h->quantf.dequant_4x4( dct4x4[i4], h->dequant4_mf[CQM_4PY], i_qp ); - h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 1; - } - else - h->mb.cache.non_zero_count[x264_scan8[i8*4+i4]] = 0; - } h->dctf.add8x8_idct( p_fdec, dct4x4 ); - } else - { - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+0]] = 0; - *(uint16_t*)&h->mb.cache.non_zero_count[x264_scan8[i8*4+2]] = 0; - } + STORE_8x8_NNZ(i8,0); } i_qp = h->mb.i_chroma_qp; @@ -983,19 +1009,17 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) dct4x4[0][0] = 0; if( h->mb.b_trellis ) - x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 ); + nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 0 ); else - h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); + nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ); - if( array_non_zero( dct4x4 ) ) + h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz; + if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*4], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp ); h->dctf.add4x4_idct( p_fdec, dct4x4 ); - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 1; } - else - h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = 0; } } h->mb.i_cbp_luma &= ~(1 << i8); @@ -1014,6 +1038,7 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ) const int i_ref = h->mb.cache.ref[0][x264_scan8[i4]]; const int mvx = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][0], h->mb.mv_min[0], h->mb.mv_max[0] ); const int mvy = x264_clip3( h->mb.cache.mv[0][x264_scan8[i4]][1], h->mb.mv_min[1], h->mb.mv_max[1] ); + int nz; h->mc.mc_luma( p_fdec, FDEC_STRIDE, h->mb.pic.p_fref[0][i_ref], h->mb.pic.i_stride[0], mvx + 4*4*block_idx_x[i4], mvy + 4*4*block_idx_y[i4], 4, 4 ); @@ -1026,15 +1051,13 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ) { DECLARE_ALIGNED_16( int16_t dct4x4[4][4] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); - x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 ); - if( array_non_zero( dct4x4 ) ) + nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 ); + h->mb.cache.non_zero_count[x264_scan8[i4]] = nz; + if( nz ) { h->zigzagf.scan_4x4( h->dct.luma4x4[i4], dct4x4 ); h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PY], i_qp ); 
h->dctf.add4x4_idct( p_fdec, dct4x4 ); - h->mb.cache.non_zero_count[x264_scan8[i4]] = 1; } - else - h->mb.cache.non_zero_count[x264_scan8[i4]] = 0; } } diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 4cc599aa..7b9f08a3 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -55,11 +55,11 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); -void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, +int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra ); -void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, +int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra, int idx ); -void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, +int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, int i_qp, int b_intra, int idx ); void x264_noise_reduction_update( x264_t *h ); diff --git a/encoder/rdo.c b/encoder/rdo.c index 1ba2a715..76cfdcaf 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -215,6 +215,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) if( i_pixel > PIXEL_8x8 ) return x264_rd_cost_subpart( h, i_lambda2, i4, i_pixel ); + h->mb.i_cbp_luma = 0; + x264_macroblock_encode_p8x8( h, i8 ); if( i_pixel == PIXEL_16x8 ) x264_macroblock_encode_p8x8( h, i8+1 ); @@ -243,6 +245,8 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel ) static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode ) { uint64_t i_ssd, i_bits; + h->mb.i_cbp_luma = 0; + h->mb.b_transform_8x8 = 1; x264_mb_encode_i8x8( h, i8, h->mb.i_qp ); i_ssd = ssd_plane( h, PIXEL_8x8, 0, (i8&1)*8, (i8>>1)*8 ); @@ -404,7 +408,7 @@ typedef struct { // comparable to the input. so unquant is the direct inverse of quant, // and uses the dct scaling factors, not the idct ones. 
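The trellis functions below now follow the same contract as the plain quant functions changed in common/quant.c: every quantizer reports whether any coefficient survived, so callers can fill the nnz cache and CBP directly instead of re-scanning blocks with array_non_zero(). A self-contained scalar sketch of that contract (the standalone form and the _sketch name are assumptions; the real change is the QUANT_ONE/quant_4x4 hunk earlier in the diff):

    #include <stdint.h>

    /* Deadzone quantization of a 4x4 block that also returns a 0/1
     * "any non-zero coefficient" flag, mirroring the patched QUANT_ONE. */
    static int quant_4x4_sketch( int16_t dct[16], const uint16_t mf[16],
                                 const uint16_t bias[16] )
    {
        int i, nz = 0;
        for( i = 0; i < 16; i++ )
        {
            if( dct[i] > 0 )
                dct[i] =  ( bias[i] + dct[i] ) * mf[i] >> 16;
            else
                dct[i] = -(( bias[i] - dct[i] ) * mf[i] >> 16);
            nz |= dct[i];
        }
        return !!nz;   /* exactly what the nnz cache and the CBP bits need */
    }

The macroblock.c hunks store this return value straight into h->mb.cache.non_zero_count[] and OR it into h->mb.i_cbp_luma, which is also why decimation can now be done by clearing NNZ/CBP instead of memsetting coefficients.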
-static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, +static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, int16_t *dct, const uint16_t *quant_mf, const int *unquant_mf, const int *coef_weight, const uint8_t *zigzag, int i_ctxBlockCat, int i_lambda2, int b_ac, int dc, int i_coefs, int idx ) @@ -419,7 +423,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, const int b_interlaced = h->mb.b_interlaced; const int f = 1 << 15; // no deadzone int i_last_nnz; - int i, j; + int i, j, nz; // (# of coefs) * (# of ctx) * (# of levels tried) = 1024 // we don't need to keep all of those: (# of coefs) * (# of ctx) would be enough, @@ -438,7 +442,7 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, if( i < b_ac ) { memset( dct, 0, i_coefs * sizeof(*dct) ); - return; + return 0; } i_last_nnz = i; @@ -613,39 +617,42 @@ static ALWAYS_INLINE void quant_trellis_cabac( x264_t *h, int16_t *dct, bnode = &nodes_cur[j]; j = bnode->level_idx; + nz = 0; for( i = b_ac; i < i_coefs; i++ ) { dct[zigzag[i]] = level_tree[j].abs_level * signs[i]; + nz |= level_tree[j].abs_level; j = level_tree[j].next; } + return !!nz; } const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3}; -void x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, +int x264_quant_dc_trellis( x264_t *h, int16_t *dct, int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra ) { - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, i_ctxBlockCat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[h->mb.b_interlaced], i_ctxBlockCat, lambda2_tab[b_intra][i_qp], 0, 1, i_ctxBlockCat==DCT_CHROMA_DC ? 4 : 16, 0 ); } -void x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, +int x264_quant_4x4_trellis( x264_t *h, int16_t dct[4][4], int i_quant_cat, int i_qp, int i_ctxBlockCat, int b_intra, int idx ) { int b_ac = (i_ctxBlockCat == DCT_LUMA_AC || i_ctxBlockCat == DCT_CHROMA_AC); - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], x264_dct4_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan4[h->mb.b_interlaced], i_ctxBlockCat, lambda2_tab[b_intra][i_qp], b_ac, 0, 16, idx ); } -void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, +int x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, int i_qp, int b_intra, int idx ) { - quant_trellis_cabac( h, (int16_t*)dct, + return quant_trellis_cabac( h, (int16_t*)dct, h->quant8_mf[i_quant_cat][i_qp], h->unquant8_mf[i_quant_cat][i_qp], x264_dct8_weight2_zigzag[h->mb.b_interlaced], x264_zigzag_scan8[h->mb.b_interlaced], diff --git a/tools/checkasm.c b/tools/checkasm.c index 9bc802ad..3f89e681 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -558,6 +558,7 @@ static int check_dct( int cpu_ref, int cpu_new ) TEST_IDCT( add8x8_idct, dct4 ); TEST_IDCT( add8x8_idct_dc, dct4 ); TEST_IDCT( add16x16_idct, dct4 ); + TEST_IDCT( add16x16_idct_dc, dct4 ); report( "add_idct4 :" ); ok = 1; used_asm = 0; @@ -958,7 +959,7 @@ static int check_quant( int cpu_ref, int cpu_new ) DECLARE_ALIGNED_16( uint8_t cqm_buf[64] ); int ret = 0, ok, used_asm; int oks[2] = {1,1}, used_asms[2] = {0,0}; - int i, i_cqm, qp; + int i, j, i_cqm, qp; x264_t h_buf; x264_t *h = &h_buf; memset( h, 0, sizeof(*h) ); @@ -1007,7 +1008,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( x = 0; x < 8; 
x++ ) \ { \ unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \ - dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \ + dct1[y*8+x] = dct2[y*8+x] = j ? (rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1019,7 +1020,7 @@ static int check_quant( int cpu_ref, int cpu_new ) for( x = 0; x < 4; x++ ) \ { \ unsigned int scale = 255*scale1d[y]*scale1d[x]; \ - dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \ + dct1[y*4+x] = dct2[y*4+x] = j ? (rand()%(2*scale+1))-scale : 0; \ } \ } @@ -1030,18 +1031,22 @@ static int check_quant( int cpu_ref, int cpu_new ) used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - for( i = 0; i < 16; i++ ) \ - dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \ - call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - if( memcmp( dct1, dct2, 16*2 ) ) \ + for( j = 0; j < 2; j++ ) \ { \ - oks[0] = 0; \ - fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ - break; \ + int result_c, result_a; \ + for( i = 0; i < 16; i++ ) \ + dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \ + result_c = call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + result_a = call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \ + { \ + oks[0] = 0; \ + fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \ + break; \ + } \ + call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ + call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ } \ - call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ - call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \ } \ } @@ -1052,17 +1057,21 @@ static int check_quant( int cpu_ref, int cpu_new ) used_asms[0] = 1; \ for( qp = 51; qp > 0; qp-- ) \ { \ - INIT_QUANT##w() \ - call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - if( memcmp( dct1, dct2, w*w*2 ) ) \ + for( j = 0; j < 2; j++ ) \ { \ - oks[0] = 0; \ - fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ - break; \ + int result_c, result_a; \ + INIT_QUANT##w() \ + result_c = call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + result_a = call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \ + { \ + oks[0] = 0; \ + fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \ + break; \ + } \ + call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ + call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ } \ - call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ - call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \ } \ } @@ -1078,6 +1087,7 @@ static int check_quant( int cpu_ref, int cpu_new ) { \ set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \ used_asms[1] = 1; \ + j = 1; \ for( qp = 
51; qp > 0; qp-- ) \ { \ INIT_QUANT##w() \
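The checkasm changes above now exercise each quant function on an all-zero block as well as a random one (the j loop), and require the returned non-zero flag to match between the C reference and the asm version in addition to the coefficient output. For the SSE4 versions mentioned in the commit message, that flag comes from ptest; the following intrinsics sketch only illustrates the same test (function name and the intrinsics-based form are assumptions; the patch implements it in hand-written asm):

    #include <smmintrin.h>   /* SSE4.1 */
    #include <stdint.h>

    /* Return 1 if any of the 16 int16_t coefficients is non-zero.
     * PTEST sets ZF when (v & v) == 0, i.e. when the block is empty. */
    static int block_has_nonzero_sse4( const int16_t dct[16] )
    {
        __m128i a = _mm_loadu_si128( (const __m128i*)&dct[0] );
        __m128i b = _mm_loadu_si128( (const __m128i*)&dct[8] );
        __m128i v = _mm_or_si128( a, b );
        return !_mm_testz_si128( v, v );
    }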