]> granicus.if.org Git - libvpx/commitdiff
Specialize Neon high bitdepth subpel variance by filter value
authorSalome Thirot <salome.thirot@arm.com>
Thu, 9 Feb 2023 14:16:30 +0000 (14:16 +0000)
committerSalome Thirot <salome.thirot@arm.com>
Mon, 13 Feb 2023 20:22:56 +0000 (20:22 +0000)
Use the same specialization as for standard bitdepth. The rationale for
the specialization is as follows:

The optimal implementation of the bilinear interpolation depends on the
filter values being used. For both horizontal and vertical interpolation
this can simplify to just taking the source values, or averaging the
source and reference values - which can be computed more easily than a
bilinear interpolation with arbitrary filter values.

This patch introduces tests to find the most optimal bilinear
interpolation implementation based on the filter values being used.
This new specialization is only used for larger block sizes.

Change-Id: I73182c979255f0332a274f2e5907df7f38c9eeb3

vpx_dsp/arm/highbd_subpel_variance_neon.c

index 07e8ea5107f2e5c4f33b441b42c6c6b5afe4916c..0682eb3a29acec5da825dca54014a641b081d743 100644 (file)
@@ -120,70 +120,175 @@ static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                       64, dst_height, filter_offset);
 }
 
-#define HBD_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
-  unsigned int vpx_highbd_8_sub_pixel_variance##w##x##h##_neon(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *ref, int ref_stride, uint32_t *sse) {                   \
-    uint16_t tmp0[w * (h + padding)];                                        \
-    uint16_t tmp1[w * h];                                                    \
-    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                            \
-                                                                             \
-    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,         \
-                                       (h + padding), xoffset);              \
-    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);        \
-                                                                             \
-    return vpx_highbd_8_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref,  \
-                                          ref_stride, sse);                  \
-  }                                                                          \
-                                                                             \
-  unsigned int vpx_highbd_10_sub_pixel_variance##w##x##h##_neon(             \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *ref, int ref_stride, uint32_t *sse) {                   \
-    uint16_t tmp0[w * (h + padding)];                                        \
-    uint16_t tmp1[w * h];                                                    \
-    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                            \
-                                                                             \
-    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,         \
-                                       (h + padding), xoffset);              \
-    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);        \
-                                                                             \
-    return vpx_highbd_10_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \
-                                           ref_stride, sse);                 \
-  }                                                                          \
-  unsigned int vpx_highbd_12_sub_pixel_variance##w##x##h##_neon(             \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *ref, int ref_stride, uint32_t *sse) {                   \
-    uint16_t tmp0[w * (h + padding)];                                        \
-    uint16_t tmp1[w * h];                                                    \
-    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                            \
-                                                                             \
-    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,         \
-                                       (h + padding), xoffset);              \
-    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);        \
-                                                                             \
-    return vpx_highbd_12_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), w, ref, \
-                                           ref_stride, sse);                 \
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+                                          uint16_t *dst_ptr, int src_stride,
+                                          int pixel_step, int dst_width,
+                                          int dst_height) {
+  int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16.)
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  do {
+    int j = 0;
+    do {
+      uint16x8_t s0 = vld1q_u16(src_ptr + j);
+      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+      uint16x8_t avg = vrhaddq_u16(s0, s1);
+      vst1q_u16(dst_ptr + j, avg);
+
+      j += 8;
+    } while (j < dst_width);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_width;
+  } while (--i != 0);
+}
+
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding)                  \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, uint32_t *sse) {                     \
+    uint16_t tmp0[w * (h + padding)];                                          \
+    uint16_t tmp1[w * h];                                                      \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,           \
+                                       (h + padding), xoffset);                \
+    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
+                                                                               \
+    return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+                                                     w, ref, ref_stride, sse); \
+  }
+
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding)      \
+  unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(     \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                 \
+    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
+                                                                               \
+    if (xoffset == 0) {                                                        \
+      if (yoffset == 0) {                                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+                                      h);                                      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      } else {                                                                 \
+        uint16_t tmp[w * h];                                                   \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,           \
+                                           src_stride, h, yoffset);            \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint16_t tmp0[w * (h + padding)];                                        \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * (h + padding)];                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + padding));                          \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * (h + padding)];                                      \
+        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
+                                      (h + padding));                          \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    } else {                                                                   \
+      uint16_t tmp0[w * (h + padding)];                                        \
+      if (yoffset == 0) {                                                      \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
+                                           xoffset);                           \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
+      } else if (yoffset == 4) {                                               \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + padding), xoffset);            \
+        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      } else {                                                                 \
+        uint16_t tmp1[w * h];                                                  \
+        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
+                                           (h + padding), xoffset);            \
+        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
+        return vpx_highbd_##bitdepth##_variance##w##x##h##_neon(               \
+            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
+      }                                                                        \
+    }                                                                          \
   }
 
 // 4x<h> blocks are processed two rows at a time, so require an extra row of
 // padding.
-HBD_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
-HBD_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
 
-HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1)
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1)
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1)
 
-HBD_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1)
 
-HBD_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1)
 
-HBD_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
-HBD_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1)
 
 // Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
 // width 4.