From fb660325d99298ab6cd2285d76f2fddf83fe34cb Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Wed, 2 Jul 2008 20:59:24 -0600
Subject: [PATCH] denoise_dct asm

---
 common/common.h        |  4 ++--
 common/quant.c         | 19 +++++++++++++++
 common/quant.h         |  2 ++
 common/x86/quant-a.asm | 53 ++++++++++++++++++++++++++++++++++++++++++
 common/x86/quant.h     |  3 +++
 encoder/analyse.c      |  2 +-
 encoder/macroblock.c   | 37 ++++-------------------------
 encoder/macroblock.h   |  1 -
 tools/checkasm.c       | 21 +++++++++++++++++
 9 files changed, 105 insertions(+), 37 deletions(-)

diff --git a/common/common.h b/common/common.h
index 33b71c40..f46aee52 100644
--- a/common/common.h
+++ b/common/common.h
@@ -293,8 +293,8 @@ struct x264_t
     uint16_t        (*quant4_bias[4])[16];   /* [4][52][16] */
     uint16_t        (*quant8_bias[2])[64];   /* [2][52][64] */
 
-    uint32_t        nr_residual_sum[2][64];
-    uint32_t        nr_offset[2][64];
+    DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
+    DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
     uint32_t        nr_count[2];
 
     /* Slice header */
diff --git a/common/quant.c b/common/quant.c
index 635d32cb..80ae7112 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -193,6 +193,20 @@ void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_q
     }
 }
 
+void x264_denoise_dct_core( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+{   // For i in [1,size): add |dct[i]| to sum[i], then shrink dct[i] toward zero by offset[i].
+    int i;
+    for( i=1; i<size; i++ )                       // starts at 1: dct[0] and sum[0] are never touched
+    {
+        int level = dct[i];
+        int sign = level>>15;                     // 0 if level >= 0, -1 if negative (relies on arithmetic right shift)
+        level = (level+sign)^sign;                // branchless absolute value
+        sum[i] += level;                          // accumulate the magnitude before the offset is applied
+        level -= offset[i];
+        dct[i] = level<0 ? 0 : (level^sign)-sign; // clamp at zero, otherwise put the original sign back
+    }
+}
+
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
     pf->quant_8x8 = quant_8x8;
@@ -203,6 +217,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->dequant_4x4 = dequant_4x4;
     pf->dequant_8x8 = dequant_8x8;
 
+    pf->denoise_dct_core = x264_denoise_dct_core;
+
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -216,6 +232,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_flat16_mmx;
             pf->dequant_8x8 = x264_dequant_8x8_flat16_mmx;
         }
+        pf->denoise_dct_core = x264_denoise_dct_core_mmx;
 #endif
     }
 
@@ -239,6 +256,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
         }
+        pf->denoise_dct_core = x264_denoise_dct_core_sse2;
     }
 
     if( cpu&X264_CPU_SSSE3 )
@@ -247,6 +265,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
         pf->quant_4x4 = x264_quant_4x4_ssse3;
         pf->quant_8x8 = x264_quant_8x8_ssse3;
+        pf->denoise_dct_core = x264_denoise_dct_core_ssse3;
     }
 #endif // HAVE_MMX
 
diff --git a/common/quant.h b/common/quant.h
index 2e48c48d..58518f79 100644
--- a/common/quant.h
+++ b/common/quant.h
@@ -32,6 +32,8 @@ typedef struct
 
     void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
     void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+
+    void (*denoise_dct_core)( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 } x264_quant_function_t;
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index d660db14..20db71eb 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -328,3 +328,56 @@ INIT_XMM
 DEQUANT sse2, 4, 4, 2
 DEQUANT sse2, 8, 6, 2
 
+
+
+;-----------------------------------------------------------------------------
+; void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
+;-----------------------------------------------------------------------------
+%macro DENOISE_DCT 1
+cglobal x264_denoise_dct_core_%1, 4,5
+    movzx     r4d, word [r0] ; backup DC coefficient (the loop below also rewrites index 0)
+    pxor      m7, m7 ; m7 = 0, interleaved below to zero-extend words to dwords
+.loop: ; per the prototype above: r0 = dct, r1 = sum, r2 = offset, r3 = size (assumes cglobal loads the 4 args into r0-r3)
+    sub       r3, regsize ; step the index down by regsize coefficients; these flags are what jg tests
+    mova      m2, [r0+r3*2+0*regsize] ; load 2 registers of 16-bit coefficients (index scaled by 2 bytes)
+    mova      m3, [r0+r3*2+1*regsize]
+    PABSW     m0, m2 ; presumably m0 = |m2| per word (macro selected by the %define before instantiation)
+    PABSW     m1, m3
+    mova      m4, m0 ; keep the magnitudes for the sum update further down
+    mova      m5, m1
+    psubusw   m0, [r2+r3*2+0*regsize] ; magnitude - offset with unsigned saturation, i.e. floored at 0
+    psubusw   m1, [r2+r3*2+1*regsize]
+    PSIGNW    m0, m2 ; presumably reapplies the sign of the original coefficients
+    PSIGNW    m1, m3
+    mova      [r0+r3*2+0*regsize], m0 ; store the denoised coefficients back in place
+    mova      [r0+r3*2+1*regsize], m1
+    mova      m2, m4 ; second copy so both low and high halves can be widened
+    mova      m3, m5
+    punpcklwd m4, m7 ; low half of first register  -> dwords
+    punpckhwd m2, m7 ; high half of first register -> dwords
+    punpcklwd m5, m7 ; low half of second register  -> dwords
+    punpckhwd m3, m7 ; high half of second register -> dwords
+    paddd     m4, [r1+r3*4+0*regsize] ; add the running 32-bit sums (index scaled by 4 bytes)
+    paddd     m2, [r1+r3*4+1*regsize]
+    paddd     m5, [r1+r3*4+2*regsize]
+    paddd     m3, [r1+r3*4+3*regsize]
+    mova      [r1+r3*4+0*regsize], m4 ; store the updated sums
+    mova      [r1+r3*4+1*regsize], m2
+    mova      [r1+r3*4+2*regsize], m5
+    mova      [r1+r3*4+3*regsize], m3
+    jg .loop ; uses the flags from the sub above; requires that nothing in between modifies EFLAGS
+    mov       [r0], r4w ; restore DC coefficient
+    RET
+%endmacro
+
+%define PABSW PABSW_MMX
+%define PSIGNW PSIGNW_MMX
+%ifndef ARCH_X86_64
+INIT_MMX
+DENOISE_DCT mmx ; emits x264_denoise_dct_core_mmx, only on non-x86_64 builds
+%endif ; !ARCH_X86_64
+INIT_XMM
+DENOISE_DCT sse2 ; still expands PABSW/PSIGNW to the *_MMX macros defined above
+%define PABSW PABSW_SSSE3
+%define PSIGNW PSIGNW_SSSE3
+DENOISE_DCT ssse3 ; same body, but PABSW/PSIGNW now expand to the *_SSSE3 macros
diff --git a/common/x86/quant.h b/common/x86/quant.h
index b55ea395..c7a560d7 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -42,5 +42,8 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], in
 void x264_dequant_8x8_flat16_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
 void x264_dequant_4x4_flat16_sse2( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
 void x264_dequant_8x8_flat16_sse2( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+void x264_denoise_dct_core_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_core_sse2( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
+void x264_denoise_dct_core_ssse3( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
 
 #endif
diff --git a/encoder/analyse.c b/encoder/analyse.c
index d9ff0bc3..6e0f9f12 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -2593,7 +2593,7 @@ void x264_macroblock_analyse( x264_t *h )
         x264_mb_analyse_transform( h );
 
     h->mb.b_trellis = h->param.analyse.i_trellis;
-    h->mb.b_noise_reduction = h->param.analyse.i_noise_reduction;
+    h->mb.b_noise_reduction = !!h->param.analyse.i_noise_reduction;
     if( h->mb.b_trellis == 1 || h->mb.b_noise_reduction )
         h->mb.i_skip_intra = 0;
 }
diff --git a/encoder/macroblock.c b/encoder/macroblock.c
index 66d034cb..403b8587 100644
--- a/encoder/macroblock.c
+++ b/encoder/macroblock.c
@@ -443,11 +443,12 @@ void x264_macroblock_encode( x264_t *h )
             DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
+            h->nr_count[1] += h->mb.b_noise_reduction * 4;
 
             for( idx = 0; idx < 4; idx++ )
             {
                 if( h->mb.b_noise_reduction )
-                    x264_denoise_dct( h, (int16_t*)dct8x8[idx] );
+                    h->quantf.denoise_dct_core( *dct8x8[idx], h->nr_residual_sum[1], h->nr_offset[1], 64 );
                 if( h->mb.b_trellis )
                     x264_quant_8x8_trellis( h, dct8x8[idx], CQM_8PY, i_qp, 0 );
                 else
@@ -482,6 +483,7 @@ void x264_macroblock_encode( x264_t *h )
         {
             DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
+            h->nr_count[0] += h->mb.b_noise_reduction * 16;
 
             for( i8x8 = 0; i8x8 < 4; i8x8++ )
             {
@@ -494,7 +496,7 @@ void x264_macroblock_encode( x264_t *h )
                     idx = i8x8 * 4 + i4x4;
 
                     if( h->mb.b_noise_reduction )
-                        x264_denoise_dct( h, (int16_t*)dct4x4[idx] );
+                        h->quantf.denoise_dct_core( *dct4x4[idx], h->nr_residual_sum[0], h->nr_offset[0], 16 );
                     if( h->mb.b_trellis )
                         x264_quant_4x4_trellis( h, dct4x4[idx], CQM_4PY, i_qp, DCT_LUMA_4x4, 0 );
                     else
@@ -738,37 +740,6 @@ void x264_noise_reduction_update( x264_t *h )
     }
 }
 
-void x264_denoise_dct( x264_t *h, int16_t *dct )
-{
-    const int cat = h->mb.b_transform_8x8;
-    int i;
-
-    h->nr_count[cat]++;
-
-    for( i = (cat ? 63 : 15); i >= 1; i-- )
-    {
-        int level = dct[i];
-        if( level )
-        {
-            if( level > 0 )
-            {
-                h->nr_residual_sum[cat][i] += level;
-                level -= h->nr_offset[cat][i];
-                if( level < 0 )
-                    level = 0;
-            }
-            else
-            {
-                h->nr_residual_sum[cat][i] -= level;
-                level += h->nr_offset[cat][i];
-                if( level > 0 )
-                    level = 0;
-            }
-            dct[i] = level;
-        }
-    }
-}
-
 /*****************************************************************************
  * RD only; 4 calls to this do not make up for one macroblock_encode.
  * doesn't transform chroma dc.
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index ba7be690..d17a919a 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -55,7 +55,6 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
                              int i_qp, int b_intra );
 
 void x264_noise_reduction_update( x264_t *h );
-void x264_denoise_dct( x264_t *h, int16_t *dct );
 
 #endif
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 89798342..85fce259 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1023,6 +1023,27 @@ static int check_quant( int cpu_ref, int cpu_new )
     ok = oks[1]; used_asm = used_asms[1];
     report( "dequant :" );
 
+
+    if( qf_a.denoise_dct_core != qf_ref.denoise_dct_core )
+    {
+        int size;
+        for( size = 16; size <= 64; size += 48 )
+        {
+            set_func_name( "denoise_dct" );
+            used_asm = 1;
+            memcpy(dct1, buf1, size*2);
+            memcpy(dct2, buf1, size*2);
+            memcpy(buf3+256, buf3, 256);
+            call_c1( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+            call_a1( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+            if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
+                ok = 0;
+            call_c2( qf_c.denoise_dct_core, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
+            call_a2( qf_a.denoise_dct_core, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
+        }
+    }
+    report( "denoise dct :" );
+
     return ret;
 }
 
-- 
2.40.0