libhb: Minor nlmeans optimizations.

author bradleys <bradley@bradleysepos.com>

Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)

committer bradleys <bradley@bradleysepos.com>

Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
author bradleys <bradley@bradleysepos.com>
Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
committer bradleys <bradley@bradleysepos.com>
Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
diff --git a/libhb/nlmeans.c b/libhb/nlmeans.c

index 78422fa1f89f1b4bcbe24947cd310f254d9d328e..25389ba77850dbdf94657c90d7328157ff275845 100644 (file)
--- a/libhb/nlmeans.c
+++ b/libhb/nlmeans.c
@@ -131,6 +131,10 @@ struct hb_filter_private_s
      int    nframes[3];     // temporal search depth in frames
      int    prefilter[3];   // prefilter mode, can improve weight analysis
  
+    float  exptable[3][NLMEANS_EXPSIZE];
+    float  weight_fact_table[3];
+    int    diff_max[3];
+
      NLMeansFunctions functions;
  
      Frame      *frame;
@@ -166,7 +170,7 @@ static void nlmeans_border(uint8_t *src,
                             int h,
                             int border)
  {
-    int bw = w + 2 * border;
+    const int bw = w + 2 * border;
      uint8_t *image = src + border + bw * border;
  
      // Create faux borders using edge pixels
@@ -192,11 +196,14 @@ static void nlmeans_deborder(BorderedPlane *src,
                               int s,
                               int h)
  {
-    int bw = src->w + 2 * src->border;
+    const int bw = src->w + 2 * src->border;
      uint8_t *image = src->mem + src->border + bw * src->border;
+
      int width = w;
      if (src->w < width)
+    {
          width = src->w;
+    }
  
      // Copy main image
      for (int y = 0; y < h; y++)
@@ -213,8 +220,8 @@ static void nlmeans_alloc(uint8_t *src,
                            BorderedPlane *dst,
                            int border)
  {
-    int bw = src_w + 2 * border;
-    int bh = src_h + 2 * border;
+    const int bw = src_w + 2 * border;
+    const int bh = src_h + 2 * border;
  
      uint8_t *mem   = malloc(bw * bh * sizeof(uint8_t));
      uint8_t *image = mem + border + bw * border;
@@ -246,11 +253,11 @@ static void nlmeans_filter_mean(uint8_t *src,
  {
  
      // Mean filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
+    const double pixel_weight = 1.0 / (size * size);
      uint16_t pixel_sum;
-    double pixel_weight = 1.0 / (size * size);
      for (int y = 0; y < h; y++)
      {
          for (int x = 0; x < w; x++)
@@ -341,9 +348,9 @@ static void nlmeans_filter_median(uint8_t *src,
                                    int size)
  {
      // Median filter
-    int bw = w + 2 * border;
-    int offset_min = -((size - 1) /2);
-    int offset_max =   (size + 1) /2;
+    const int bw = w + 2 * border;
+    const int offset_min = -((size - 1) /2);
+    const int offset_max =   (size + 1) /2;
      int index;
      uint8_t pixels[size * size];
      for (int y = 0; y < h; y++)
@@ -371,19 +378,19 @@ static void nlmeans_filter_edgeboost(uint8_t *src,
                                       int h,
                                       int border)
  {
-    int bw = w + 2 * border;
-    int bh = h + 2 * border;
+    const int bw = w + 2 * border;
+    const int bh = h + 2 * border;
  
      // Custom kernel
-    int kernel_size = 3;
-    int kernel[3][3] = {{-31, 0, 31},
-                        {-44, 0, 44},
-                        {-31, 0, 31}};
-    double kernel_coef = 1.0 / 126.42;
+    const int kernel_size = 3;
+    const int kernel[3][3] = {{-31, 0, 31},
+                              {-44, 0, 44},
+                              {-31, 0, 31}};
+    const double kernel_coef = 1.0 / 126.42;
  
      // Detect edges
-    int offset_min = -((kernel_size - 1) /2);
-    int offset_max =   (kernel_size + 1) /2;
+    const int offset_min = -((kernel_size - 1) /2);
+    const int offset_max =   (kernel_size + 1) /2;
      uint16_t pixel1;
      uint16_t pixel2;
      uint8_t *mask_mem = calloc(bw * bh, sizeof(uint8_t));
@@ -487,11 +494,11 @@ static void nlmeans_prefilter(BorderedPlane *src,
          // Source image
          uint8_t *mem   = src->mem;
          uint8_t *image = src->image;
-        int border     = src->border;
-        int w          = src->w;
-        int h          = src->h;
-        int bw         = w + 2 * border;
-        int bh         = h + 2 * border;
+        const int border     = src->border;
+        const int w          = src->w;
+        const int h          = src->h;
+        const int bw         = w + 2 * border;
+        const int bh         = h + 2 * border;
  
          // Duplicate plane
          uint8_t *mem_pre = malloc(bw * bh * sizeof(uint8_t));
@@ -584,14 +591,11 @@ static void build_integral_scalar(uint32_t *integral,
                                    int       dx,
                                    int       dy)
  {
-    memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
      for (int y = 0; y < h; y++)
      {
          const uint8_t *p1 = src_pre + y*src_w;
          const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
-        uint32_t *out = integral + (y*integral_stride) - 1;
-
-        *out++ = 0;
+        uint32_t *out = integral + (y*integral_stride);
  
          for (int x = 0; x < w; x++)
          {
@@ -625,38 +629,28 @@ static void nlmeans_plane(NLMeansFunctions *functions,
                            double h_param,
                            double origin_tune,
                            int n,
-                          int r)
+                          int r,
+                    const float *exptable,
+                    const float  weight_fact_table,
+                    const int    diff_max)
  {
-    int n_half = (n-1) /2;
-    int r_half = (r-1) /2;
+    const int n_half = (n-1) /2;
+    const int r_half = (r-1) /2;
  
      // Source image
      uint8_t *src     = frame[0].plane[plane].image;
      uint8_t *src_pre = frame[0].plane[plane].image_pre;
-    int border       = frame[0].plane[plane].border;
-    int src_w        = frame[0].plane[plane].w + 2 * border;
+    const int border = frame[0].plane[plane].border;
+    const int src_w  = frame[0].plane[plane].w + 2 * border;
  
      // Allocate temporary pixel sums
      struct PixelSum *tmp_data = calloc(w * h, sizeof(struct PixelSum));
  
      // Allocate integral image
-    int integral_stride = w + 2 * 16;
-    uint32_t *integral_mem = malloc(integral_stride * (h+1) * sizeof(uint32_t));
+    const int integral_stride = w + 2 * 16;
+    uint32_t *integral_mem = calloc(integral_stride * (h+1), sizeof(uint32_t));
      uint32_t *integral     = integral_mem + integral_stride + 16;
  
-    // Precompute exponential table
-    float exptable[NLMEANS_EXPSIZE];
-    const float weight_factor       = 1.0/n/n / (h_param * h_param);
-    const float min_weight_in_table = 0.0005;
-    const float stretch             = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
-    const float weight_fact_table   = weight_factor * stretch;
-    const int   diff_max            = NLMEANS_EXPSIZE / weight_fact_table;
-    for (int i = 0; i < NLMEANS_EXPSIZE; i++)
-    {
-        exptable[i] = exp(-i/stretch);
-    }
-    exptable[NLMEANS_EXPSIZE-1] = 0;
-
      // Iterate through available frames
      for (int f = 0; f < nframes; f++)
      {
@@ -665,8 +659,8 @@ static void nlmeans_plane(NLMeansFunctions *functions,
          // Compare image
          uint8_t *compare     = frame[f].plane[plane].image;
          uint8_t *compare_pre = frame[f].plane[plane].image_pre;
-        int border           = frame[f].plane[plane].border;
-        int compare_w        = frame[f].plane[plane].w + 2 * border;
+        const int border     = frame[f].plane[plane].border;
+        const int compare_w  = frame[f].plane[plane].w + 2 * border;
  
          // Iterate through all displacements
          for (int dy = -r_half; dy <= r_half; dy++)
@@ -712,19 +706,19 @@ static void nlmeans_plane(NLMeansFunctions *functions,
  
                      for (int x = 0; x <= w-n; x++)
                      {
-                        int xc = x + n_half;
-                        int yc = y + n_half;
+                        const int xc = x + n_half;
+                        const int yc = y + n_half;
  
                          // Difference between patches
-                        int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
+                        const int diff = (uint32_t)(integral_ptr2[n] - integral_ptr2[0] - integral_ptr1[n] + integral_ptr1[0]);
  
                          // Sum pixel with weight
                          if (diff < diff_max)
                          {
-                            int diffidx = diff * weight_fact_table;
+                            const int diffidx = diff * weight_fact_table;
  
                              //float weight = exp(-diff*weightFact);
-                            float weight = exptable[diffidx];
+                            const float weight = exptable[diffidx];
  
                              tmp_data[yc*w + xc].weight_sum += weight;
                              tmp_data[yc*w + xc].pixel_sum  += weight * compare[(yc+dy)*compare_w + xc + dx];
@@ -837,6 +831,21 @@ static int nlmeans_init(hb_filter_object_t *filter,
          if (pv->prefilter[c] < 0)       { pv->prefilter[c] = 0; }
  
          if (pv->max_frames < pv->nframes[c]) pv->max_frames = pv->nframes[c];
+
+        // Precompute exponential table
+        float *exptable = &pv->exptable[c][0];
+        float *weight_fact_table = &pv->weight_fact_table[c];
+        int   *diff_max = &pv->diff_max[c];
+        const float weight_factor        = 1.0/pv->patch_size[c]/pv->patch_size[c] / (pv->strength[c] * pv->strength[c]);
+        const float min_weight_in_table  = 0.0005;
+        const float stretch              = NLMEANS_EXPSIZE / (-log(min_weight_in_table));
+        *(weight_fact_table)             = weight_factor * stretch;
+        *(diff_max)                      = NLMEANS_EXPSIZE / *(weight_fact_table);
+        for (int i = 0; i < NLMEANS_EXPSIZE; i++)
+        {
+            exptable[i] = exp(-i/stretch);
+        }
+        exptable[NLMEANS_EXPSIZE-1] = 0;
      }
  
      pv->thread_count = hb_get_cpu_count();
@@ -981,7 +990,10 @@ static void nlmeans_filter_thread(void *thread_args_v)
                            pv->strength[c],
                            pv->origin_tune[c],
                            pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
          }
          buf->s = pv->frame[segment].s;
          thread_data->out = buf;
@@ -1015,7 +1027,9 @@ static void nlmeans_add_frame(hb_filter_private_t *pv, hb_buffer_t *buf)
  static hb_buffer_t * nlmeans_filter(hb_filter_private_t *pv)
  {
      if (pv->next_frame < pv->max_frames + pv->thread_count)
+    {
          return NULL;
+    }
  
      taskset_cycle(&pv->taskset);
  
@@ -1102,7 +1116,9 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
  
              int nframes = pv->next_frame - f;
              if (pv->nframes[c] < nframes)
+            {
                  nframes = pv->nframes[c];
+            }
              // Process current plane
              nlmeans_plane(functions,
                            frame,
@@ -1116,7 +1132,10 @@ static hb_buffer_t * nlmeans_filter_flush(hb_filter_private_t *pv)
                            pv->strength[c],
                            pv->origin_tune[c],
                            pv->patch_size[c],
-                          pv->range[c]);
+                          pv->range[c],
+                          pv->exptable[c],
+                          pv->weight_fact_table[c],
+                          pv->diff_max[c]);
          }
          buf->s = frame->s;
          if (out == NULL)
diff --git a/libhb/nlmeans_x86.c b/libhb/nlmeans_x86.c

index 9acba22d402faed929ce7d52965239f40fefef52..685ac857e606770e4b06a497bd825010150ceeb0 100644 (file)
--- a/libhb/nlmeans_x86.c
+++ b/libhb/nlmeans_x86.c
@@ -29,21 +29,15 @@ static void build_integral_sse2(uint32_t *integral,
  {
      const __m128i zero = _mm_set1_epi8(0);
  
-    memset(integral-1 - integral_stride, 0, (w+1) * sizeof(uint32_t));
-
      for (int y = 0; y < h; y++)
      {
          __m128i prevadd = _mm_set1_epi32(0);
  
          const uint8_t *p1 = src_pre + y*src_w;
          const uint8_t *p2 = compare_pre + (y+dy)*compare_w + dx;
-        uint32_t *out = integral + (y*integral_stride) - 1;
-
-        *out++ = 0;
-
-        const int pixels_step = 16;
+        uint32_t *out = integral + (y*integral_stride);
  
-        for (int x = 0; x < w; x += pixels_step)
+        for (int x = 0; x < w; x += 16)
          {
              __m128i pa, pb;
              __m128i pla, plb;
@@ -116,16 +110,16 @@ static void build_integral_sse2(uint32_t *integral,
              _mm_store_si128((__m128i*)(out+12), hhdiff); // Store high diff high in memory
  
              // Increment
-            out += pixels_step;
-            p1  += pixels_step;
-            p2  += pixels_step;
+            out += 16;
+            p1  += 16;
+            p2  += 16;
          }
  
          if (y > 0)
          {
              out = integral + y*integral_stride;
  
-            for (int x = 0; x < w; x += pixels_step)
+            for (int x = 0; x < w; x += 16)
              {
                  *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
                                                   *(__m128i*)(out));
@@ -139,7 +133,7 @@ static void build_integral_sse2(uint32_t *integral,
                  *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
                                                        *(__m128i*)(out+12));
  
-                out += 4*4;
+                out += 16;
              }
          }
      }
author	bradleys <bradley@bradleysepos.com>
	Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
committer	bradleys <bradley@bradleysepos.com>
	Fri, 6 Feb 2015 11:05:33 +0000 (11:05 +0000)
libhb/nlmeans.c		patch \| blob \| history
libhb/nlmeans_x86.c		patch \| blob \| history