From fb660325d99298ab6cd2285d76f2fddf83fe34cb Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 2 Jul 2008 20:59:24 -0600 Subject: [PATCH] denoise_dct asm --- common/common.h | 4 ++-- common/quant.c | 19 +++++++++++++++ common/quant.h | 2 ++ common/x86/quant-a.asm | 53 ++++++++++++++++++++++++++++++++++++++++++ common/x86/quant.h | 3 +++ encoder/analyse.c | 2 +- encoder/macroblock.c | 37 ++++------------------------- encoder/macroblock.h | 1 - tools/checkasm.c | 21 +++++++++++++++++ 9 files changed, 105 insertions(+), 37 deletions(-) diff --git a/common/common.h b/common/common.h index 33b71c40..f46aee52 100644 --- a/common/common.h +++ b/common/common.h @@ -293,8 +293,8 @@ struct x264_t uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */ uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */ - uint32_t nr_residual_sum[2][64]; - uint32_t nr_offset[2][64]; + DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] ); + DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] ); uint32_t nr_count[2]; /* Slice header */ diff --git a/common/quant.c b/common/quant.c index 635d32cb..80ae7112 100644 --- a/common/quant.c +++ b/common/quant.c @@ -193,6 +193,20 @@ void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_q } } +void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) +{ + int i; + for( i=1; i<size; i++ ) + { + int level = dct[i]; + int sign = level>>15; + level = (level+sign)^sign; + sum[i] += level; + level -= offset[i]; + dct[i] = level<0 ? 
0 : (level^sign)-sign; + } +} + void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { pf->quant_8x8 = quant_8x8; @@ -203,6 +217,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = dequant_4x4; pf->dequant_8x8 = dequant_8x8; + pf->denoise_dct_core = x264_denoise_dct_core; + #ifdef HAVE_MMX if( cpu&X264_CPU_MMX ) { @@ -216,6 +232,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx; pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx; } + pf->denoise_dct_core = x264_denoise_dct_core_mmx; #endif } @@ -239,6 +256,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2; pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2; } + pf->denoise_dct_core = x264_denoise_dct_core_sse2; } if( cpu&X264_CPU_SSSE3 ) @@ -247,6 +265,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3; pf->quant_4x4 = x264_quant_4x4_ssse3; pf->quant_8x8 = x264_quant_8x8_ssse3; + pf->denoise_dct_core = x264_denoise_dct_core_ssse3; } #endif // HAVE_MMX diff --git a/common/quant.h b/common/quant.h index 2e48c48d..58518f79 100644 --- a/common/quant.h +++ b/common/quant.h @@ -32,6 +32,8 @@ typedef struct void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); + + void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); } x264_quant_function_t; void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ); diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index d660db14..20db71eb 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -328,3 +328,56 @@ INIT_XMM DEQUANT sse2, 4, 4, 2 DEQUANT sse2, 8, 6, 2 + + +;----------------------------------------------------------------------------- +; void 
x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) +;----------------------------------------------------------------------------- +%macro DENOISE_DCT 1 +cglobal x264_denoise_dct_core_%1, 4,5 + movzx r4d, word [r0] ; backup DC coefficient + pxor m7, m7 +.loop: + sub r3, regsize + mova m2, [r0+r3*2+0*regsize] + mova m3, [r0+r3*2+1*regsize] + PABSW m0, m2 + PABSW m1, m3 + mova m4, m0 + mova m5, m1 + psubusw m0, [r2+r3*2+0*regsize] + psubusw m1, [r2+r3*2+1*regsize] + PSIGNW m0, m2 + PSIGNW m1, m3 + mova [r0+r3*2+0*regsize], m0 + mova [r0+r3*2+1*regsize], m1 + mova m2, m4 + mova m3, m5 + punpcklwd m4, m7 + punpckhwd m2, m7 + punpcklwd m5, m7 + punpckhwd m3, m7 + paddd m4, [r1+r3*4+0*regsize] + paddd m2, [r1+r3*4+1*regsize] + paddd m5, [r1+r3*4+2*regsize] + paddd m3, [r1+r3*4+3*regsize] + mova [r1+r3*4+0*regsize], m4 + mova [r1+r3*4+1*regsize], m2 + mova [r1+r3*4+2*regsize], m5 + mova [r1+r3*4+3*regsize], m3 + jg .loop + mov [r0], r4w ; restore DC coefficient + RET +%endmacro + +%define PABSW PABSW_MMX +%define PSIGNW PSIGNW_MMX +%ifndef ARCH_X86_64 +INIT_MMX +DENOISE_DCT mmx +%endif +INIT_XMM +DENOISE_DCT sse2 +%define PABSW PABSW_SSSE3 +%define PSIGNW PSIGNW_SSSE3 +DENOISE_DCT ssse3 diff --git a/common/x86/quant.h b/common/x86/quant.h index b55ea395..c7a560d7 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -42,5 +42,8 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], in void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp ); +void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); +void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ); +void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t 
*offset, int size ); #endif diff --git a/encoder/analyse.c b/encoder/analyse.c index d9ff0bc3..6e0f9f12 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -2593,7 +2593,7 @@ void x264_macroblock_analyse( x264_t *h ) x264_mb_analyse_transform( h ); h->mb.b_trellis = h->param.analyse.i_trellis; - h->mb.b_noise_reduction = h->param.analyse.i_noise_reduction; + h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction; if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction ) h->mb.i_skip_intra = 0; } diff --git a/encoder/macroblock.c b/encoder/macroblock.c index 66d034cb..403b8587 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -443,11 +443,12 @@ void x264_macroblock_encode( x264_t *h ) DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] ); b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] ); + h->nr_count[1] += h->mb.b_noise_reduction * 4; for( idx = 0; idx < 4; idx++ ) { if( h->mb.b_noise_reduction ) - x264_denoise_dct( h, (int16_t*)dct8x8[idx] ); + h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 ); if( h->mb.b_trellis ) x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 ); else @@ -482,6 +483,7 @@ void x264_macroblock_encode( x264_t *h ) { DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] ); h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] ); + h->nr_count[0] += h->mb.b_noise_reduction * 16; for( i8x8 = 0; i8x8 < 4; i8x8++ ) { @@ -494,7 +496,7 @@ void x264_macroblock_encode( x264_t *h ) idx = i8x8 * 4 + i4x4; if( h->mb.b_noise_reduction ) - x264_denoise_dct( h, (int16_t*)dct4x4[idx] ); + h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 ); if( h->mb.b_trellis ) x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 ); else @@ -738,37 +740,6 @@ void x264_noise_reduction_update( x264_t *h ) } } -void x264_denoise_dct( x264_t *h, int16_t *dct 
) -{ - const int cat = h->mb.b_transform_8x8; - int i; - - h->nr_count[cat]++; - - for( i = (cat ? 63 : 15); i >= 1; i-- ) - { - int level = dct[i]; - if( level ) - { - if( level > 0 ) - { - h->nr_residual_sum[cat][i] += level; - level -= h->nr_offset[cat][i]; - if( level < 0 ) - level = 0; - } - else - { - h->nr_residual_sum[cat][i] -= level; - level += h->nr_offset[cat][i]; - if( level > 0 ) - level = 0; - } - dct[i] = level; - } - } -} - /***************************************************************************** * RD only; 4 calls to this do not make up for one macroblock_encode. * doesn't transform chroma dc. diff --git a/encoder/macroblock.h b/encoder/macroblock.h index ba7be690..d17a919a 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -55,7 +55,6 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat, int i_qp, int b_intra ); void x264_noise_reduction_update( x264_t *h ); -void x264_denoise_dct( x264_t *h, int16_t *dct ); #endif diff --git a/tools/checkasm.c b/tools/checkasm.c index 89798342..85fce259 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1023,6 +1023,27 @@ static int check_quant( int cpu_ref, int cpu_new ) ok = oks[1]; used_asm = used_asms[1]; report( "dequant :" ); + + if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core ) + { + int size; + for( size = 16; size <= 64; size += 48 ) + { + set_func_name( "denoise_dct" ); + used_asm = 1; + memcpy(dct1, buf1, size*2); + memcpy(dct2, buf1, size*2); + memcpy(buf3+256, buf3, 256); + call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size ); + call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size ); + if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) ) + ok = 0; + call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size ); + call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size ); + } + } + report( "denoise dct :" ); 
+ return ret; } -- 2.40.0