From: Angie Chiang <angiebird@google.com>
Date: Fri, 11 Mar 2016 19:57:30 +0000 (+0000)
Subject: Merge "convolve8 sse2 test" into nextgenv2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c0f708c03a3e10f7bb92b0ab59220259a83d8962;p=libvpx

Merge "convolve8 sse2 test" into nextgenv2
---

c0f708c03a3e10f7bb92b0ab59220259a83d8962
diff --cc test/vp10_convolve_test.cc
index 07b0dda42,c22064b44..122a8e502
--- a/test/vp10_convolve_test.cc
+++ b/test/vp10_convolve_test.cc
@@@ -239,35 -223,157 +240,163 @@@ TEST(VP10ConvolveTest, vp10_highbd_conv
      src1[i] = rnd.Rand16() % (1 << bd);
    }
  
 -  int offset = filter_size * filter_center + filter_center;
 -
 -  avg = 0;
 -  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
 -                       CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
 -                       filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 -                       y_step_q4, avg, bd);
 -  avg = 0;
 -  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
 -                       CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
 -                       filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 -                       y_step_q4, avg, bd);
 -
 -  avg = 0;
 -  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
 -                       CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, filter_params,
 -                       subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
 -  avg = 1;
 -  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
 -                       CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, filter_params,
 -                       subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
 -
 -  EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
 +  for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
 +    for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
 +      int offset = filter_size * filter_center + filter_center;
 +
 +      avg = 0;
 +      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
 +                           CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
 +                           filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 +                           y_step_q4, avg, bd);
 +      avg = 0;
 +      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
 +                           CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
 +                           filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 +                           y_step_q4, avg, bd);
 +
 +      avg = 0;
 +      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
 +                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
 +                           filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 +                           y_step_q4, avg, bd);
 +      avg = 1;
 +      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
 +                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
 +                           filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
 +                           y_step_q4, avg, bd);
 +
 +      EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
 +    }
 +  }
  }
  #endif  // CONFIG_VP9_HIGHBITDEPTH
+ 
+ #define CONVOLVE_SPEED_TEST 0
+ #if CONVOLVE_SPEED_TEST
+ // Defines a gtest case that calls the high-bit-depth convolve function
+ // `func` 100000 times on block_size x block_size blocks so that its run
+ // time can be compared against other implementations.
+ //   func       - convolve function under test (called with a trailing bd)
+ //   block_size - width and height of each convolved block
+ //   frame_size - width and height of the destination buffer; the source
+ //                buffer has 7 extra rows/columns (presumably the footprint
+ //                of the 8-tap filter)
+ // row_offset is the position within a row and col_offset selects the row
+ // (it is the one multiplied by the stride).
+ #define highbd_convolve_speed(func, block_size, frame_size)                  \
+   TEST(VP10ConvolveTest, func##_speed_##block_size##_##frame_size) {         \
+     ACMRandom rnd(ACMRandom::DeterministicSeed());                           \
+     INTERP_FILTER interp_filter = EIGHTTAP;                                  \
+     InterpFilterParams filter_params =                                       \
+         vp10_get_interp_filter_params(interp_filter);                        \
+     ptrdiff_t filter_size = filter_params.tap;                               \
+     int filter_center = filter_size / 2 - 1;                                 \
+     DECLARE_ALIGNED(16, uint16_t,                                            \
+                     src[(frame_size + 7) * (frame_size + 7)]) = {0};         \
+     int src_stride = frame_size + 7;                                         \
+     DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = {0};       \
+     int dst_stride = frame_size;                                             \
+     int x_step_q4 = 16;                                                      \
+     int y_step_q4 = 16;                                                      \
+     int subpel_x_q4 = 8;                                                     \
+     int subpel_y_q4 = 6;                                                     \
+     int bd = 10;                                                             \
+                                                                              \
+     int w = block_size;                                                      \
+     int h = block_size;                                                      \
+                                                                              \
+     const int16_t* filter_x =                                                \
+         vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);           \
+     const int16_t* filter_y =                                                \
+         vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);           \
+                                                                              \
+     for (int i = 0; i < src_stride * src_stride; i++) {                      \
+       src[i] = rnd.Rand16() % (1 << bd);                                     \
+     }                                                                        \
+                                                                              \
+     int offset = filter_center * src_stride + filter_center;                 \
+     int row_offset = 0;                                                      \
+     int col_offset = 0;                                                      \
+     for (int i = 0; i < 100000; i++) {                                       \
+       int src_total_offset = offset + col_offset * src_stride + row_offset;  \
+       int dst_total_offset = col_offset * dst_stride + row_offset;           \
+       func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride,           \
+            CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
+            x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
+       /* Step along the row while another block fits, else wrap. */          \
+       if (row_offset + w + w < frame_size) {                                 \
+         row_offset += w;                                                     \
+       } else {                                                               \
+         row_offset = 0;                                                      \
+         col_offset += h;                                                     \
+       }                                                                      \
+       if (col_offset + h >= frame_size) {                                    \
+         col_offset = 0;                                                      \
+       }                                                                      \
+     }                                                                        \
+   }
+ 
+ // Defines a gtest case that calls the 8-bit convolve function `func`
+ // 100000 times on block_size x block_size blocks so that its run time can
+ // be compared against other implementations.
+ //   func       - convolve function under test
+ //   block_size - width and height of each convolved block
+ //   frame_size - width and height of the destination buffer; the source
+ //                buffer has 7 extra rows/columns (presumably the footprint
+ //                of the 8-tap filter)
+ // row_offset is the position within a row and col_offset selects the row
+ // (it is the one multiplied by the stride).
+ #define lowbd_convolve_speed(func, block_size, frame_size)                  \
+   TEST(VP10ConvolveTest, func##_speed_l_##block_size##_##frame_size) {      \
+     ACMRandom rnd(ACMRandom::DeterministicSeed());                          \
+     INTERP_FILTER interp_filter = EIGHTTAP;                                 \
+     InterpFilterParams filter_params =                                      \
+         vp10_get_interp_filter_params(interp_filter);                       \
+     ptrdiff_t filter_size = filter_params.tap;                              \
+     int filter_center = filter_size / 2 - 1;                                \
+     DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
+     int src_stride = frame_size + 7;                                        \
+     DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]);             \
+     int dst_stride = frame_size;                                            \
+     int x_step_q4 = 16;                                                     \
+     int y_step_q4 = 16;                                                     \
+     int subpel_x_q4 = 8;                                                    \
+     int subpel_y_q4 = 6;                                                    \
+     int bd = 8;                                                             \
+                                                                             \
+     int w = block_size;                                                     \
+     int h = block_size;                                                     \
+                                                                             \
+     const int16_t* filter_x =                                               \
+         vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);          \
+     const int16_t* filter_y =                                               \
+         vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);          \
+                                                                             \
+     for (int i = 0; i < src_stride * src_stride; i++) {                     \
+       src[i] = rnd.Rand16() % (1 << bd);                                    \
+     }                                                                       \
+                                                                             \
+     int offset = filter_center * src_stride + filter_center;                \
+     int row_offset = 0;                                                     \
+     int col_offset = 0;                                                     \
+     for (int i = 0; i < 100000; i++) {                                      \
+       int src_total_offset = offset + col_offset * src_stride + row_offset; \
+       int dst_total_offset = col_offset * dst_stride + row_offset;          \
+       func(src + src_total_offset, src_stride, dst + dst_total_offset,      \
+            dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h);     \
+       /* Step along the row while another block fits, else wrap. */         \
+       if (row_offset + w + w < frame_size) {                                \
+         row_offset += w;                                                    \
+       } else {                                                              \
+         row_offset = 0;                                                     \
+         col_offset += h;                                                    \
+       }                                                                     \
+       if (col_offset + h >= frame_size) {                                   \
+         col_offset = 0;                                                     \
+       }                                                                     \
+     }                                                                       \
+   }
+ 
+ // Observed result: with a 64x64 frame, vpx_highbd_convolve8_sse2 and
+ // vpx_convolve8_sse2 run at similar speeds. With a 1024x1024 frame,
+ // vpx_highbd_convolve8_sse2 is around 50% slower than vpx_convolve8_sse2.
+ // We think the bottleneck is memory I/O.
+ // Macro arguments are (function under test, block size, frame size).
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 64);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 64);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 64);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 64);
+ 
+ lowbd_convolve_speed(vpx_convolve8_sse2, 8, 64);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 16, 64);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 32, 64);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 64, 64);
+ 
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 1024);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 1024);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 1024);
+ highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 1024);
+ 
+ lowbd_convolve_speed(vpx_convolve8_sse2, 8, 1024);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 16, 1024);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 32, 1024);
+ lowbd_convolve_speed(vpx_convolve8_sse2, 64, 1024);
+ #endif  // CONVOLVE_SPEED_TEST
  }  // namespace