uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
- uint32_t nr_residual_sum[2][64];
- uint32_t nr_offset[2][64];
+ DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
+ DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
/* Slice header */
}
}
+void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size ) /* for i in [1,size): sum[i] += |dct[i]|, then shrink |dct[i]| by offset[i] towards zero */
+{
+ int i;
+ for( i=1; i<size; i++ ) /* starts at 1: index 0 of dct/sum/offset is never touched here */
+ {
+ int level = dct[i];
+ int sign = level>>15; /* -1 for a negative int16_t value, 0 otherwise (relies on arithmetic right shift) */
+ level = (level+sign)^sign; /* absolute value, computed without a branch */
+ sum[i] += level; /* accumulate the magnitude before the offset is removed */
+ level -= offset[i];
+ dct[i] = level<0 ? 0 : (level^sign)-sign; /* clamp at zero, otherwise reapply the original sign */
+ }
+}
+
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
+ pf->denoise_dct_core = x264_denoise_dct_core;
+
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
}
+ pf->denoise_dct_core = x264_denoise_dct_core_mmx;
#endif
}
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
+ pf->denoise_dct_core = x264_denoise_dct_core_sse2;
}
if( cpu&X264_CPU_SSSE3 )
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
+ pf->denoise_dct_core = x264_denoise_dct_core_ssse3;
}
#endif // HAVE_MMX
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+
+ void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2
+
+
+;-----------------------------------------------------------------------------
+; void x264_denoise_dct_core_{mmx,sse2,ssse3}( int16_t *dct, uint32_t *sum,
+;                                              uint16_t *offset, int size )
+;
+; %1 = suffix of the generated symbol (the macro is instantiated once per
+;      instruction set).
+;
+; For every coefficient the magnitude |dct[i]| is added to sum[i]; the
+; magnitude is then reduced by offset[i] with unsigned saturation (psubusw,
+; so it stops at zero) and written back with the sign of the original value.
+; PABSW / PSIGNW are macros selected by %define outside this macro body.
+;
+; dct[0] is saved before the loop and restored afterwards, so the DC
+; coefficient is returned unchanged. sum[0] is NOT protected: it is updated
+; by the vector loop like every other entry.
+;
+; Each iteration consumes 2*regsize bytes of dct/offset and 4*regsize bytes
+; of sum, i.e. regsize coefficients, walking from the end of the arrays to
+; the start. size must therefore be a positive multiple of regsize, and all
+; three arrays must satisfy the alignment required by mova.
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 1
+cglobal x264_denoise_dct_core_%1, 4,5
+ movzx r4d, word [r0] ; backup DC coefficient
+ pxor m7, m7 ; zero register used to widen words to dwords
+%ifdef ARCH_X86_64
+ ; size is a 32-bit int; the upper half of the 64-bit argument register is
+ ; unspecified by the calling convention, but r3 is used at full width as
+ ; the loop counter and as an address index below.
+ movsxd r3, r3d
+%endif
+.loop:
+ sub r3, regsize ; sets the flags tested by jg at the bottom; only SIMD ops in between
+ mova m2, [r0+r3*2+0*regsize]
+ mova m3, [r0+r3*2+1*regsize]
+ PABSW m0, m2
+ PABSW m1, m3
+ mova m4, m0 ; keep the unmodified magnitudes for the sum update
+ mova m5, m1
+ psubusw m0, [r2+r3*2+0*regsize] ; saturating subtract: magnitude never goes below zero
+ psubusw m1, [r2+r3*2+1*regsize]
+ PSIGNW m0, m2
+ PSIGNW m1, m3
+ mova [r0+r3*2+0*regsize], m0
+ mova [r0+r3*2+1*regsize], m1
+ mova m2, m4
+ mova m3, m5
+ punpcklwd m4, m7 ; zero-extend the 16-bit magnitudes to 32 bits
+ punpckhwd m2, m7
+ punpcklwd m5, m7
+ punpckhwd m3, m7
+ paddd m4, [r1+r3*4+0*regsize]
+ paddd m2, [r1+r3*4+1*regsize]
+ paddd m5, [r1+r3*4+2*regsize]
+ paddd m3, [r1+r3*4+3*regsize]
+ mova [r1+r3*4+0*regsize], m4
+ mova [r1+r3*4+1*regsize], m2
+ mova [r1+r3*4+2*regsize], m5
+ mova [r1+r3*4+3*regsize], m3
+ jg .loop
+ mov [r0], r4w ; restore DC coefficient
+ RET
+%endmacro
+
+%define PABSW PABSW_MMX
+%define PSIGNW PSIGNW_MMX
+%ifndef ARCH_X86_64
+INIT_MMX
+DENOISE_DCT mmx
+%endif
+INIT_XMM
+DENOISE_DCT sse2
+%define PABSW PABSW_SSSE3
+%define PSIGNW PSIGNW_SSSE3
+DENOISE_DCT ssse3
void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
#endif
x264_mb_analyse_transform( h );
h->mb.b_trellis = h->param.analyse.i_trellis;
- h->mb.b_noise_reduction = h->param.analyse.i_noise_reduction;
+ h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
h->mb.i_skip_intra = 0;
}
DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
+ h->nr_count[1] += h->mb.b_noise_reduction * 4;
for( idx = 0; idx < 4; idx++ )
{
if( h->mb.b_noise_reduction )
- x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
+ h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
if( h->mb.b_trellis )
x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
else
{
DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
+ h->nr_count[0] += h->mb.b_noise_reduction * 16;
for( i8x8 = 0; i8x8 < 4; i8x8++ )
{
idx = i8x8 * 4 + i4x4;
if( h->mb.b_noise_reduction )
- x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
+ h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
if( h->mb.b_trellis )
x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
else
}
}
-void x264_denoise_dct( x264_t *h, int16_t *dct )
-{
- const int cat = h->mb.b_transform_8x8;
- int i;
-
- h->nr_count[cat]++;
-
- for( i = (cat ? 63 : 15); i >= 1; i-- )
- {
- int level = dct[i];
- if( level )
- {
- if( level > 0 )
- {
- h->nr_residual_sum[cat][i] += level;
- level -= h->nr_offset[cat][i];
- if( level < 0 )
- level = 0;
- }
- else
- {
- h->nr_residual_sum[cat][i] -= level;
- level += h->nr_offset[cat][i];
- if( level > 0 )
- level = 0;
- }
- dct[i] = level;
- }
- }
-}
-
/*****************************************************************************
* RD only; 4 calls to this do not make up for one macroblock_encode.
* doesn't transform chroma dc.
int i_qp, int b_intra );
void x264_noise_reduction_update( x264_t *h );
-void x264_denoise_dct( x264_t *h, int16_t *dct );
#endif
ok = oks[1]; used_asm = used_asms[1];
report( "dequant :" );
+
+ if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core )
+ {
+ int size;
+ for( size = 16; size <= 64; size += 48 )
+ {
+ set_func_name( "denoise_dct" );
+ used_asm = 1;
+ memcpy(dct1, buf1, size*2);
+ memcpy(dct2, buf1, size*2);
+ memcpy(buf3+256, buf3, 256);
+ call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
+ ok = 0;
+ call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+ call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+ }
+ }
+ report( "denoise dct :" );
+
return ret;
}