From 64075c9b0129efcdba8c85d7519dbe385dbc56c5 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Fri, 10 Aug 2012 12:35:55 -0700
Subject: [PATCH] Encoder denoiser performance improvement

The denoiser function was modified to reduce the computational
complexity.

1. The denoiser c function modification:
The original implementation calculated pixel's filter_coefficient
based on the pixel value difference between current raw frame and last
denoised raw frame, and stored them in lookup tables. For each pixel c,
find its coefficient using
    filter_coefficient[c] = LUT[abs_diff[c]];
and then apply filtering operation for the pixel.

The denoising filter costed about 12% of encoding time when it was
turned on, and half of the time was spent on finding coefficients in
lookup tables. In order to simplify the process, a short cut was taken.
The pixel adjustments vs. pixel diff value were calculated ahead of time.
    adjustment = filtered_value - current_raw
               = (filter_coefficient * diff + 128) >> 8

The adjustment vs. diff curve becomes flat very quick when diff increases.
This allowed us to use only several levels to get a close approximation
of the curve. Following the denoiser algorithm, the adjustments are
further modified according to how big the motion magnitude is.

2. The sse2 function was rewritten.

This change made denoiser filter function 3x faster, and improved the
encoder performance by 7% ~ 10% with the denoiser on.

Change-Id: I93a4308963b8e80c7307f96ffa8b8c667425bf50
---
 vp8/encoder/denoising.c          | 183 +++++++++++++------------------
 vp8/encoder/denoising.h          |  10 +-
 vp8/encoder/x86/denoising_sse2.c | 155 ++++++++++----------------
 3 files changed, 136 insertions(+), 212 deletions(-)

diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index d6b03e63a..c0dd7c106 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,151 +23,118 @@ static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
 
 /*
- * The filtering coefficients used for denoizing are adjusted for static
- * blocks, or blocks with very small motion vectors. This is done through
- * the motion magnitude parameter.
+ * The filter function was modified to reduce the computational complexity.
+ * Step 1:
+ * Instead of applying tap coefficients for each pixel, we calculated the
+ * pixel adjustments vs. pixel diff value ahead of time.
+ *     adjustment = filtered_value - current_raw
+ *                = (filter_coefficient * diff + 128) >> 8
+ * where
+ *     filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ *     filter_coefficient += filter_coefficient /
+ *                           (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to 0 ~ 255.
  *
- * There are currently 2048 possible mapping from absolute difference to
- * filter coefficient depending on the motion magnitude. Each mapping is
- * in a LUT table. All these tables are staticly allocated but they are only
- * filled on their first use.
- *
- * Each entry is a pair of 16b values, the coefficient and its complement
- * to 256. Each of these value should only be 8b but they are 16b wide to
- * avoid slow partial register manipulations.
+ * Step 2:
+ * The adjustment vs. diff curve becomes flat very quick when diff increases.
+ * This allowed us to use only several levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff       adjustment w/o motion correction   adjustment w/ motion correction
+ * [-255, -16]           -6                                   -7
+ * [-15, -8]             -4                                   -5
+ * [-7, -4]              -3                                   -4
+ * [-3, 3]               diff                                 diff
+ * [4, 7]                 3                                    4
+ * [8, 15]                4                                    5
+ * [16, 255]              6                                    7
  */
-enum {num_motion_magnitude_adjustments = 2048};
-
-static union coeff_pair filter_coeff_LUT[num_motion_magnitude_adjustments][256];
-static uint8_t filter_coeff_LUT_initialized[num_motion_magnitude_adjustments] =
-    { 0 };
-
-
-union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude)
-{
-    union coeff_pair *LUT;
-    unsigned int motion_magnitude_adjustment = motion_magnitude >> 3;
-
-    if (motion_magnitude_adjustment >= num_motion_magnitude_adjustments)
-    {
-        motion_magnitude_adjustment = num_motion_magnitude_adjustments - 1;
-    }
-
-    LUT = filter_coeff_LUT[motion_magnitude_adjustment];
-
-    if (!filter_coeff_LUT_initialized[motion_magnitude_adjustment])
-    {
-        int absdiff;
-
-        for (absdiff = 0; absdiff < 256; ++absdiff)
-        {
-            unsigned int filter_coefficient;
-            filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
-            filter_coefficient += filter_coefficient /
-                                  (3 + motion_magnitude_adjustment);
-
-            if (filter_coefficient > 255)
-            {
-                filter_coefficient = 255;
-            }
-
-            LUT[absdiff].as_short[0] = filter_coefficient ;
-            LUT[absdiff].as_short[1] = 256 - filter_coefficient;
-        }
-
-        filter_coeff_LUT_initialized[motion_magnitude_adjustment] = 1;
-    }
-
-    return LUT;
-}
-
-
 
 int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
-                          YV12_BUFFER_CONFIG *running_avg,
-                          MACROBLOCK *signal,
-                          unsigned int motion_magnitude,
-                          int y_offset,
+                          YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
+                          unsigned int motion_magnitude, int y_offset,
                           int uv_offset)
 {
-    unsigned char filtered_buf[16*16];
-    unsigned char *filtered = filtered_buf;
     unsigned char *sig = signal->thismb;
     int sig_stride = 16;
     unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
     int mc_avg_y_stride = mc_running_avg->y_stride;
     unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
     int avg_y_stride = running_avg->y_stride;
-    const union coeff_pair *LUT = vp8_get_filter_coeff_LUT(motion_magnitude);
-    int r, c;
+    int r, c, i;
     int sum_diff = 0;
+    int adj_val[3] = {3, 4, 6};
 
-    for (r = 0; r < 16; ++r)
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
     {
-        /* Calculate absolute differences */
-        unsigned char abs_diff[16];
-
-        union coeff_pair filter_coefficient[16];
-
-        for (c = 0; c < 16; ++c)
-        {
-            int absdiff = sig[c] - mc_running_avg_y[c];
-            absdiff = absdiff > 0 ? absdiff : -absdiff;
-            abs_diff[c] = absdiff;
-        }
-
-        /* Use LUT to get filter coefficients (two 16b value; f and 256-f) */
-        for (c = 0; c < 16; ++c)
-        {
-            filter_coefficient[c] = LUT[abs_diff[c]];
-        }
+        for (i = 0; i < 3; i++)
+            adj_val[i] += 1;
+    }
 
-        /* Filtering... */
+    for (r = 0; r < 16; ++r)
+    {
         for (c = 0; c < 16; ++c)
         {
-            const uint16_t state = (uint16_t)(mc_running_avg_y[c]);
-            const uint16_t sample = (uint16_t)(sig[c]);
-
-            running_avg_y[c] = (filter_coefficient[c].as_short[0] * state +
-                    filter_coefficient[c].as_short[1] * sample + 128) >> 8;
-        }
+            int diff = 0;
+            int adjustment = 0;
+            int absdiff = 0;
 
-        /* Depending on the magnitude of the difference between the signal and
-         * filtered version, either replace the signal by the filtered one or
-         * update the filter state with the signal when the change in a pixel
-         * isn't classified as noise.
-         */
-        for (c = 0; c < 16; ++c)
-        {
-            const int diff = sig[c] - running_avg_y[c];
-            sum_diff += diff;
+            diff = mc_running_avg_y[c] - sig[c];
+            absdiff = abs(diff);
 
-            if (diff * diff < NOISE_DIFF2_THRESHOLD)
+            /* When |diff| < 4, use pixel value from last denoised raw. */
+            if (absdiff <= 3)
             {
-                filtered[c] = running_avg_y[c];
+                running_avg_y[c] = mc_running_avg_y[c];
+                sum_diff += diff;
             }
             else
             {
-                filtered[c] = sig[c];
-                running_avg_y[c] = sig[c];
+                if (absdiff >= 4 && absdiff <= 7)
+                    adjustment = adj_val[0];
+                else if (absdiff >= 8 && absdiff <= 15)
+                    adjustment = adj_val[1];
+                else
+                    adjustment = adj_val[2];
+
+                if (diff > 0)
+                {
+                    if ((sig[c] + adjustment) > 255)
+                        running_avg_y[c] = 255;
+                    else
+                        running_avg_y[c] = sig[c] + adjustment;
+
+                    sum_diff += adjustment;
+                }
+                else
+                {
+                    if ((sig[c] - adjustment) < 0)
+                        running_avg_y[c] = 0;
+                    else
+                        running_avg_y[c] = sig[c] - adjustment;
+
+                    sum_diff -= adjustment;
+                }
             }
         }
 
         /* Update pointers for next iteration. */
         sig += sig_stride;
-        filtered += 16;
         mc_running_avg_y += mc_avg_y_stride;
         running_avg_y += avg_y_stride;
     }
+
     if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
-    {
         return COPY_BLOCK;
-    }
-    vp8_copy_mem16x16(filtered_buf, 16, signal->thismb, sig_stride);
+
+    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+                      signal->thismb, sig_stride);
     return FILTER_BLOCK;
 }
 
-
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
 {
     int i;
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index 2f5fbff70..b025f5cdf 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -13,8 +13,8 @@
 
 #include "block.h"
 
-#define NOISE_DIFF2_THRESHOLD (75)
 #define SUM_DIFF_THRESHOLD (16 * 16 * 2)
+#define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
 enum vp8_denoiser_decision
 {
@@ -39,12 +39,4 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                              int recon_yoffset,
                              int recon_uvoffset);
 
-union coeff_pair
-{
-    uint32_t as_int;
-    uint16_t as_short[2];
-};
-
-union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude);
-
 #endif  /* VP8_ENCODER_DENOISING_H_ */
diff --git a/vp8/encoder/x86/denoising_sse2.c b/vp8/encoder/x86/denoising_sse2.c
index fbce8d13c..c1ac6c137 100644
--- a/vp8/encoder/x86/denoising_sse2.c
+++ b/vp8/encoder/x86/denoising_sse2.c
@@ -9,7 +9,6 @@
  */
 
 #include "vp8/encoder/denoising.h"
-
 #include "vp8/common/reconinter.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
@@ -19,7 +18,7 @@
 
 union sum_union {
     __m128i v;
-    short e[8];
+    signed char e[16];
 };
 
 int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
@@ -27,128 +26,94 @@ int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
                              MACROBLOCK *signal, unsigned int motion_magnitude,
                              int y_offset, int uv_offset)
 {
-    unsigned char filtered_buf[16*16];
-    unsigned char *filtered = filtered_buf;
     unsigned char *sig = signal->thismb;
     int sig_stride = 16;
     unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
     int mc_avg_y_stride = mc_running_avg->y_stride;
     unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
     int avg_y_stride = running_avg->y_stride;
-    const union coeff_pair *LUT = vp8_get_filter_coeff_LUT(motion_magnitude);
-    int r, c;
-    __m128i acc_diff = { 0 };
+    int r;
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    const __m128i k_4 = _mm_set1_epi8(4);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+                      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
 
     for (r = 0; r < 16; ++r)
     {
-        __m128i filter_coefficient_00, filter_coefficient_04;
-        __m128i filter_coefficient_08, filter_coefficient_12;
-        __m128i v_sig0, v_sig1;
-        __m128i v_mc_running_avg_y0, v_mc_running_avg_y1;
-        __m128i state0, state1, state2, state3;
-        __m128i res0, res1, res2, res3;
+        /* Calculate differences */
+        const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
+        const __m128i v_mc_running_avg_y = _mm_loadu_si128(
+                                           (__m128i *)(&mc_running_avg_y[0]));
         __m128i v_running_avg_y;
-        __m128i diff0, diff1, diff0sq, diff1sq, diff_sq;
-        const __m128i kNOISE_DIFF2_THRESHOLD =
-                _mm_set1_epi8(NOISE_DIFF2_THRESHOLD);
-        __m128i take_running, p0, p1, p2;
-        const __m128i k_zero = _mm_set1_epi16(0);
-        const __m128i k_128 = _mm_set1_epi32(128);
-
-        /* Calculate absolute differences */
-        DECLARE_ALIGNED_ARRAY(16,unsigned char,abs_diff,16);
-        DECLARE_ALIGNED_ARRAY(16,uint32_t,filter_coefficient,16);
-        __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
-        __m128i v_mc_running_avg_y = _mm_loadu_si128(
-                                         (__m128i *)(&mc_running_avg_y[0]));
-        __m128i a_minus_b = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
-        __m128i b_minus_a = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
-        __m128i v_abs_diff = _mm_adds_epu8(a_minus_b, b_minus_a);
-        _mm_store_si128((__m128i *)(&abs_diff[0]), v_abs_diff);
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
+        /* Obtain the sign. FF if diff is negative. */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+                                        _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2 l1 and l0 adjustments */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0 */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj,  padj, nadj;
 
-        /* Use LUT to get filter coefficients (two 16b value; f and 256-f) */
-        for (c = 0; c < 16; ++c)
-        {
-            filter_coefficient[c] = LUT[abs_diff[c]].as_int;
-        }
+        /* Combine the adjustments and get absolute adjustments. */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
 
-        /* Filtering... */
-        /* load filter coefficients (two 16b value; f and 256-f) */
-        filter_coefficient_00 = _mm_load_si128(
-                (__m128i *)(&filter_coefficient[ 0]));
-        filter_coefficient_04 = _mm_load_si128(
-                (__m128i *)(&filter_coefficient[ 4]));
-        filter_coefficient_08 = _mm_load_si128(
-                (__m128i *)(&filter_coefficient[ 8]));
-        filter_coefficient_12 = _mm_load_si128(
-                (__m128i *)(&filter_coefficient[12]));
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
 
-        /* expand sig from 8b to 16b */
-        v_sig0 = _mm_unpacklo_epi8(v_sig, k_zero);
-        v_sig1 = _mm_unpackhi_epi8(v_sig, k_zero);
-        /* expand mc_running_avg_y from 8b to 16b */
-        v_mc_running_avg_y0 = _mm_unpacklo_epi8(v_mc_running_avg_y, k_zero);
-        v_mc_running_avg_y1 = _mm_unpackhi_epi8(v_mc_running_avg_y, k_zero);
-        /* interleave sig and mc_running_avg_y for upcoming multiply-add */
-        state0 = _mm_unpacklo_epi16(v_mc_running_avg_y0, v_sig0);
-        state1 = _mm_unpackhi_epi16(v_mc_running_avg_y0, v_sig0);
-        state2 = _mm_unpacklo_epi16(v_mc_running_avg_y1, v_sig1);
-        state3 = _mm_unpackhi_epi16(v_mc_running_avg_y1, v_sig1);
-        /* blend values */
-        res0 = _mm_madd_epi16(filter_coefficient_00, state0);
-        res1 = _mm_madd_epi16(filter_coefficient_04, state1);
-        res2 = _mm_madd_epi16(filter_coefficient_08, state2);
-        res3 = _mm_madd_epi16(filter_coefficient_12, state3);
-        res0 = _mm_add_epi32(res0, k_128);
-        res1 = _mm_add_epi32(res1, k_128);
-        res2 = _mm_add_epi32(res2, k_128);
-        res3 = _mm_add_epi32(res3, k_128);
-        res0 = _mm_srai_epi32(res0, 8);
-        res1 = _mm_srai_epi32(res1, 8);
-        res2 = _mm_srai_epi32(res2, 8);
-        res3 = _mm_srai_epi32(res3, 8);
-        /* combine the 32b results into a single 8b vector */
-        res0 = _mm_packs_epi32(res0, res1);
-        res2 = _mm_packs_epi32(res2, res3);
-        v_running_avg_y = _mm_packus_epi16(res0, res2);
+        /* Calculate filtered value. */
+        v_running_avg_y = _mm_adds_epu8(v_sig, padj);
+        v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
+        _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
 
-        /* Depending on the magnitude of the difference between the signal and
-         * filtered version, either replace the signal by the filtered one or
-         * update the filter state with the signal when the change in a pixel
-         * isn't classified as noise.
+        /* Adjustments <=7, and each element in acc_diff can fit in signed
+         * char.
          */
-        diff0 = _mm_sub_epi16(v_sig0, res0);
-        diff1 = _mm_sub_epi16(v_sig1, res2);
-        acc_diff = _mm_add_epi16(acc_diff, _mm_add_epi16(diff0, diff1));
-
-        diff0sq = _mm_mullo_epi16(diff0, diff0);
-        diff1sq = _mm_mullo_epi16(diff1, diff1);
-        diff_sq = _mm_packus_epi16(diff0sq, diff1sq);
-        take_running = _mm_cmplt_epi8(diff_sq, kNOISE_DIFF2_THRESHOLD);
-        p0 = _mm_and_si128(take_running, v_running_avg_y);
-        p1 = _mm_andnot_si128(take_running, v_sig);
-        p2 = _mm_or_si128(p0, p1);
-        _mm_storeu_si128((__m128i *)(&running_avg_y[0]), p2);
-        _mm_storeu_si128((__m128i *)(&filtered[0]), p2);
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
 
         /* Update pointers for next iteration. */
         sig += sig_stride;
-        filtered += 16;
         mc_running_avg_y += mc_avg_y_stride;
         running_avg_y += avg_y_stride;
     }
+
     {
         /* Compute the sum of all pixel differences of this MB. */
         union sum_union s;
-        int sum_diff;
+        int sum_diff = 0;
         s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] +
-          s.e[4] + s.e[5] + s.e[6] + s.e[7];
+        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
+                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
+                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
+
         if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
         {
             return COPY_BLOCK;
         }
     }
-    vp8_copy_mem16x16(filtered_buf, 16, signal->thismb, sig_stride);
+
+    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+                      signal->thismb, sig_stride);
     return FILTER_BLOCK;
 }
-- 
2.50.1