granicus.if.org Git - handbrake/commitdiff
libhb: Minor nlmeans optimizations.
author bradleys <bradley@bradleysepos.com>
Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
committer bradleys <bradley@bradleysepos.com>
Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
Use calloc for the nlmeans integral instead of zeroing it with memset in-loop.
Replace superfluous const with literal in SSE implementation.
Move exponential table calculation out of the main loop.
More const correctness.
Add some braces.

Overall, slightly more readable/maintainable and (very) slightly faster.

git-svn-id: svn://svn.handbrake.fr/HandBrake/trunk@6874 b64f7644-9d1e-0410-96f1-a4d463321fa5

libhb/nlmeans.c
libhb/nlmeans_x86.c

index 78422fa1f89f1b4bcbe24947cd310f254d9d328e..25389ba77850dbdf94657c90d7328157ff275845 100644 (file)
@@ -131,6 +131,10 @@ struct hb_filter_private_s
     int    nframes[3];     // temporal search depth in frames
     int    prefilter[3];   // prefilter mode, can improve weight analysis
 
+    float  exptable[3][NLMEANS_EXPSIZE];
+    float  weight_fact_table[3];
+    int    diff_max[3];
+
     NLMeansFunctions functions;
 
     Frame      *frame;
@@ -166,7 +170,7 @@ static void nlmeans_border(uint8_t *src,
                            int h,
                            int border)
 {
-    int bw = w + 2 * border;
+    const int bw = w + 2 * border;
     uint8_t *image = src + border + bw * border;
 
     // Create faux borders using edge pixels
@@ -192,11 +196,14 @@ static void nlmeans_deborder(BorderedPlane *src,
                              int s,
                              int h)
 {
-    int bw = src->w + 2 * src->border;
+    const int bw = src->w + 2 * src->border;
     uint8_t *image = src->mem + src->border + bw * src->border;
+
     int width = w;
     if (src->w < width)
+    {
         width = src->w;
+    }
 
     // Copy main image
     for (int y = 0; y < h; y++)
@@ -213,8 +220,8 @@ static void nlmeans_alloc(uint8_t *src,
                           BorderedPlane *dst,
                           int border)
 {
-    int bw = src_w + 2 * border;
-    int bh = src_h + 2 * border;
+    const int bw = src_w + 2 * border;
+    const int bh = src_h + 2 * border;
 
     uint8_t *mem   = malloc(bw * bh * sizeof(uint8_t));
     uint8_t *image = mem + border + bw * border;
@@ -246,11 +253,11 @@ static void nlmeans_filter_mean(uint8_t *src,
 {
 
     // Mean filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
+    const double pixel_weight = 1.0 / (size * size);
     uint16_t pixel_sum;
-    double pixel_weight = 1.0 / (size * size);
     for (int y = 0; y < h; y++)
     {
         for (int x = 0; x < w; x++)
@@ -341,9 +348,9 @@ static void nlmeans_filter_median(uint8_t *src,
                                   int size)
 {
     // Median filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
     int index;
     uint8_t pixels[size * size];
     for (int y = 0; y < h; y++)
@@ -371,19 +378,19 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
                                      int h,
                                      int border)
 {
-    int bw = w + 2 * border;
-    int bh = h + 2 * border;
+    const int bw = w + 2 * border;
+    const int bh = h + 2 * border;
 
     // Custom kernel
-    int kernel_size = 3;
-    int kernel[3][3] = {{-31, 0, 31},
-                        {-44, 0, 44},
-                        {-31, 0, 31}};
-    double kernel_coef = 1.0 / 126.42;
+    const int kernel_size = 3;
+    const int kernel[3][3] = {{-31, 0, 31},
+                              {-44, 0, 44},
+                              {-31, 0, 31}};
+    const double kernel_coef = 1.0 / 126.42;
 
     // Detect edges
-    int offset_min = -((kernel_size - 1) /2);
-    int offset_max =   (kernel_size + 1) /2;
+    const int offset_min = -((kernel_size - 1) /2);
+    const int offset_max =   (kernel_size + 1) /2;
     uint16_t pixel1;
     uint16_t pixel2;
     uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t));
@@ -487,11 +494,11 @@ static void nlmeans_prefilter(BorderedPlane *src,
         // Source image
         uint8_t *mem   = src->mem;
         uint8_t *image = src->image;
-        int border     = src->border;
-        int w          = src->w;
-        int h          = src->h;
-        int bw         = w + 2 * border;
-        int bh         = h + 2 * border;
+        const int border     = src->border;
+        const int w          = src->w;
+        const int h          = src->h;
+        const int bw         = w + 2 * border;
+        const int bh         = h + 2 * border;
 
         // Duplicate plane
         uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t));
@@ -584,14 +591,11 @@ static void build_integral_scalar(uint32_t *integral,
                                   int       dx,
                                   int       dy)
 {
-    memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
     for (int y = 0; y < h; y++)
     {
         const uint8_t *p1 = src_pre + y*src_w;
         const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
-        uint32_t *out = integral + (y*integral_stride) - 1;
-
-        *out++ = 0;
+        uint32_t *out = integral + (y*integral_stride);
 
         for (int x = 0; x < w; x++)
         {
@@ -625,38 +629,28 @@ static void nlmeans_plane(NLMeansFunctions *functions,
                           double h_param,
                           double origin_tune,
                           int n,
-                          int r)
+                          int r,
+                    const float *exptable,
+                    const float  weight_fact_table,
+                    const int    diff_max)
 {
-    int n_half = (n-1) /2;
-    int r_half = (r-1) /2;
+    const int n_half = (n-1) /2;
+    const int r_half = (r-1) /2;
 
     // Source image
     uint8_t *src     = frame[0].plane[plane].image;
     uint8_t *src_pre = frame[0].plane[plane].image_pre;
-    int border       = frame[0].plane[plane].border;
-    int src_w        = frame[0].plane[plane].w + 2 * border;
+    const int border = frame[0].plane[plane].border;
+    const int src_w  = frame[0].plane[plane].w + 2 * border;
 
     // Allocate temporary pixel sums
     struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
 
     // Allocate integral image
-    int integral_stride = w + 2 * 16;
-    uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
+    const int integral_stride = w + 2 * 16;
+    uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t));
     uint32_t *integral     = integral_mem + integral_stride + 16;
 
-    // Precompute exponential table
-    float exptable[NLMEANS_EXPSIZE];
-    const float weight_factor       = 1.0/n/n / (h_param * h_param);
-    const float min_weight_in_table = 0.0005;
-    const float stretch             = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
-    const float weight_fact_table   = weight_factor * stretch;
-    const int   diff_max            = NLMEANS_EXPSIZE / weight_fact_table;
-    for (int i = 0; i < NLMEANS_EXPSIZE; i++)
-    {
-        exptable[i] = exp(-i/stretch);
-    }
-    exptable[NLMEANS_EXPSIZE-1] = 0;
-
     // Iterate through available frames
     for (int f = 0; f < nframes; f++)
     {
@@ -665,8 +659,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
         // Compare image
         uint8_t *compare     = frame[f].plane[plane].image;
         uint8_t *compare_pre = frame[f].plane[plane].image_pre;
-        int border           = frame[f].plane[plane].border;
-        int compare_w        = frame[f].plane[plane].w + 2 * border;
+        const int border     = frame[f].plane[plane].border;
+        const int compare_w  = frame[f].plane[plane].w + 2 * border;
 
         // Iterate through all displacements
         for (int dy = -r_half; dy <= r_half; dy++)
@@ -712,19 +706,19 @@ static void nlmeans_plane(NLMeansFunctions *functions,
 
                     for (int x = 0; x <= w-n; x++)
                     {
-                        int xc = x + n_half;
-                        int yc = y + n_half;
+                        const int xc = x + n_half;
+                        const int yc = y + n_half;
 
                         // Difference between patches
-                        int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
+                        const int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
 
                         // Sum pixel with weight
                         if (diff < diff_max)
                         {
-                            int diffidx = diff * weight_fact_table;
+                            const int diffidx = diff * weight_fact_table;
 
                             //float weight = exp(-diff*weightFact);
-                            float weight = exptable[diffidx];
+                            const float weight = exptable[diffidx];
 
                             tmp_data[yc*w + xc].weight_sum += weight;
                             tmp_data[yc*w + xc].pixel_sum  += weight * compare[(yc+dy)*compare_w + xc + dx];
@@ -837,6 +831,21 @@ static int nlmeans_init(hb_filter_object_t *filter,
         if (pv->prefilter[c] < 0)       { pv->prefilter[c] = 0; }
 
         if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c];
+
+        // Precompute exponential table
+        float *exptable = &pv->exptable[c][0];
+        float *weight_fact_table = &pv->weight_fact_table[c];
+        int   *diff_max = &pv->diff_max[c];
+        const float weight_factor        = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]);
+        const float min_weight_in_table  = 0.0005;
+        const float stretch              = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
+        *(weight_fact_table)             = weight_factor * stretch;
+        *(diff_max)                      = NLMEANS_EXPSIZE / *(weight_fact_table);
+        for (int i = 0; i < NLMEANS_EXPSIZE; i++)
+        {
+            exptable[i] = exp(-i/stretch);
+        }
+        exptable[NLMEANS_EXPSIZE-1] = 0;
     }
 
     pv->thread_count = hb_get_cpu_count();
@@ -981,7 +990,10 @@ static void nlmeans_filter_thread(void *thread_args_v)
                           pv->strength[c],
                           pv->origin_tune[c],
                           pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
         }
         buf->s = pv->frame[segment].s;
         thread_data->out = buf;
@@ -1015,7 +1027,9 @@ static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
 static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
 {
     if (pv->next_frame < pv->max_frames + pv->thread_count)
+    {
         return NULL;
+    }
 
     taskset_cycle(&pv->taskset);
 
@@ -1102,7 +1116,9 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
 
             int nframes = pv->next_frame - f;
             if (pv->nframes[c] < nframes)
+            {
                 nframes = pv->nframes[c];
+            }
             // Process current plane
             nlmeans_plane(functions,
                           frame,
@@ -1116,7 +1132,10 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
                           pv->strength[c],
                           pv->origin_tune[c],
                           pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
         }
         buf->s = frame->s;
         if (out == NULL)
index 9acba22d402faed929ce7d52965239f40fefef52..685ac857e606770e4b06a497bd825010150ceeb0 100644 (file)
@@ -29,21 +29,15 @@ static void build_integral_sse2(uint32_t *integral,
 {
     const __m128i zero = _mm_set1_epi8(0);
 
-    memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
-
     for (int y = 0; y < h; y++)
     {
         __m128i prevadd = _mm_set1_epi32(0);
 
         const uint8_t *p1 = src_pre + y*src_w;
         const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
-        uint32_t *out = integral + (y*integral_stride) - 1;
-
-        *out++ = 0;
-
-        const int pixels_step = 16;
+        uint32_t *out = integral + (y*integral_stride);
 
-        for (int x = 0; x < w; x += pixels_step)
+        for (int x = 0; x < w; x += 16)
         {
             __m128i pa, pb;
             __m128i pla, plb;
@@ -116,16 +110,16 @@ static void build_integral_sse2(uint32_t *integral,
             _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory
 
             // Increment
-            out += pixels_step;
-            p1  += pixels_step;
-            p2  += pixels_step;
+            out += 16;
+            p1  += 16;
+            p2  += 16;
         }
 
         if (y > 0)
         {
             out = integral + y*integral_stride;
 
-            for (int x = 0; x < w; x += pixels_step)
+            for (int x = 0; x < w; x += 16)
             {
                 *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
                                                  *(__m128i*)(out));
@@ -139,7 +133,7 @@ static void build_integral_sse2(uint32_t *integral,
                 *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
                                                       *(__m128i*)(out+12));
 
-                out += 4*4;
+                out += 16;
             }
         }
     }