From abd00505d1c658cc106bad51369197270a299f92 Mon Sep 17 00:00:00 2001
From: Geza Lore
Date: Fri, 12 Feb 2016 16:04:35 +0000
Subject: [PATCH] Add optimized vpx_sum_squares_2d_i16 for vp10.

Using this we can eliminate a large number of calls to predict intra,
and it is also faster than most of the variance functions it replaces.
This is an equivalence transform, so coding performance is unaffected.

Encoder speedup is approx 7% when var_tx, super_tx and ext_tx are all
enabled.

Change-Id: I0d4c83afc4a97a1826f3abd864bd68e41bb504fb
---
 test/sum_squares_test.cc       | 133 +++++++++++++++++++++++++++++++++
 test/test.mk                   |   2 +-
 vp10/encoder/rdopt.c           |  54 ++++++-------
 vpx_dsp/sum_squares.c          |  29 +++++++
 vpx_dsp/vpx_dsp.mk             |   7 +-
 vpx_dsp/vpx_dsp_common.h       |  12 +++
 vpx_dsp/vpx_dsp_rtcd_defs.pl   |   8 ++
 vpx_dsp/x86/sum_squares_sse2.c | 119 +++++++++++++++++++++++++++++
 8 files changed, 336 insertions(+), 28 deletions(-)
 create mode 100644 test/sum_squares_test.cc
 create mode 100644 vpx_dsp/sum_squares.c
 create mode 100644 vpx_dsp/x86/sum_squares_sse2.c

diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
new file mode 100644
index 000000000..7de7a814f
--- /dev/null
+++ b/test/sum_squares_test.cc
@@ -0,0 +1,133 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int kNumIterations = 10000;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
+
+typedef std::tr1::tuple<SSI16Func, SSI16Func> SumSquaresParam;
+
+class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> {
+ public:
+  virtual ~SumSquaresTest() {}
+  virtual void SetUp() {
+    ref_func_ = GET_PARAM(0);
+    tst_func_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  SSI16Func ref_func_;
+  SSI16Func tst_func_;
+};
+
+TEST_P(SumSquaresTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, int16_t, src[256*256]);
+
+  int failed = 0;
+
+  const int msb = 11;   // Up to 12 bit input
+  const int limit = 1 << (msb+1);
+
+  for (int k = 0; k < kNumIterations; k++) {
+    int size = 4 << rnd(6);     // Up to 128x128
+    int stride = 4 << rnd(7);   // Up to 256 stride
+    while (stride < size) {     // Make sure it's valid
+      stride = 4 << rnd(7);
+    }
+
+    for (int ii = 0; ii < size; ii++) {
+      for (int jj = 0; jj < size; jj++) {
+        src[ii*stride+jj] = rnd(2) ? rnd(limit) : -rnd(limit);
+      }
+    }
+
+    uint64_t res_ref = ref_func_(src, stride, size);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: Sum Squares Test"
+          << " C output does not match optimized output.";
+    }
+  }
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, int16_t, src[256*256]);
+
+  int failed = 0;
+
+  const int msb = 11;   // Up to 12 bit input
+  const int limit = 1 << (msb+1);
+
+  for (int k = 0; k < kNumIterations; k++) {
+    int size = 4 << rnd(6);     // Up to 128x128
+    int stride = 4 << rnd(7);   // Up to 256 stride
+    while (stride < size) {     // Make sure it's valid
+      stride = 4 << rnd(7);
+    }
+
+    int val = rnd(2) ? limit-1 : -(limit-1);
+    for (int ii = 0; ii < size; ii++) {
+      for (int jj = 0; jj < size; jj++) {
+        src[ii*stride+jj] = val;
+      }
+    }
+
+    uint64_t res_ref = ref_func_(src, stride, size);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+          << "Error: Sum Squares Test"
+          << " C output does not match optimized output.";
+    }
+  }
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SumSquaresTest,
+    ::testing::Values(
+        make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2)
+    )
+);
+#endif // HAVE_SSE2
+}  // namespace
diff --git a/test/test.mk b/test/test.mk
index 7926caeeb..a73ebd986 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -163,11 +163,11 @@ endif # VP9

 ## VP10
 ifeq ($(CONFIG_VP10),yes)
-
 LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 endif # VP10

 ## Multi-codec / unconditional whitebox tests.
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 0a94733dc..9dc1ebd64 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -654,27 +654,31 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   if (!is_inter_block(mbmi)) {
     struct encode_b_args arg = {x, NULL, &mbmi->skip};
 #if CONFIG_VAR_TX
-    uint8_t *dst, *src;
-    int src_stride = x->plane[plane].src.stride;
-    int dst_stride = xd->plane[plane].dst.stride;
-    unsigned int tmp_sse;
-    PREDICTION_MODE mode = (plane == 0) ?
-        get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-
-    src = &x->plane[plane].src.buf[4 * (blk_row * src_stride + blk_col)];
-    dst = &xd->plane[plane].dst.buf[4 * (blk_row * dst_stride + blk_col)];
-    vp10_predict_intra_block(xd, b_width_log2_lookup[plane_bsize],
-                             b_height_log2_lookup[plane_bsize],
-                             tx_size, mode, dst, dst_stride,
-                             dst, dst_stride, blk_col, blk_row, plane);
-    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
-                                                   dst, dst_stride, &tmp_sse);
-    sse = (int64_t)tmp_sse * 16;
     vp10_encode_block_intra(plane, block, blk_row, blk_col,
                             plane_bsize, tx_size, &arg);
-    args->cpi->fn_ptr[txsize_to_bsize[tx_size]].vf(src, src_stride,
-                                                   dst, dst_stride, &tmp_sse);
-    dist = (int64_t)tmp_sse * 16;
+
+    {
+      const int bs = 4 << tx_size;
+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+      const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
+
+      const struct macroblock_plane *const p = &x->plane[plane];
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+      const int src_stride = p->src.stride;
+      const int dst_stride = pd->dst.stride;
+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+
+      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+      unsigned int tmp;
+
+      sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, bs) * 16;
+      variance(src, src_stride, dst, dst_stride, &tmp);
+      dist = (int64_t)tmp * 16;
+    }
 #else
     vp10_encode_block_intra(plane, block, blk_row, blk_col,
                             plane_bsize, tx_size, &arg);
@@ -2330,6 +2334,8 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
 #else
   DECLARE_ALIGNED(16, uint8_t, rec_buffer[32 * 32]);
 #endif
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];

   int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
   int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
@@ -2360,20 +2366,16 @@ void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
   if (blk_row + (bh >> 2) > max_blocks_high ||
       blk_col + (bh >> 2) > max_blocks_wide) {
     int idx, idy;
-    unsigned int this_sse;
    int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
    int blocks_width = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
    for (idy = 0; idy < blocks_height; idy += 2) {
      for (idx = 0; idx < blocks_width; idx += 2) {
-        cpi->fn_ptr[BLOCK_8X8].vf(src + 4 * idy * src_stride + 4 * idx,
-                                  src_stride,
-                                  rec_buffer + 4 * idy * 32 + 4 * idx,
-                                  32, &this_sse);
-        tmp_sse += this_sse;
+        const int16_t *d = diff + 4 * idy * diff_stride + 4 * idx;
+        tmp_sse += vpx_sum_squares_2d_i16(d, diff_stride, 8);
      }
    }
  } else {
-    cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, 32, &tmp_sse);
+    tmp_sse = vpx_sum_squares_2d_i16(diff, diff_stride, bh);
  }

  *bsse += (int64_t)tmp_sse * 16;
diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c
new file mode 100644
index 000000000..8a5a3d985
--- /dev/null
+++ b/vpx_dsp/sum_squares.c
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
+                                  int size) {
+  int r, c;
+  uint64_t ss = 0;
+
+  for (r = 0; r < size; r++) {
+    for (c = 0; c < size; c++) {
+      const int16_t v = src[c];
+      ss += v*v;
+    }
+    src += src_stride;
+  }
+
+  return ss;
+}
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index e394688c7..a44f948fa 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -266,6 +266,12 @@ endif
 endif # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER

+ifeq ($(CONFIG_VP10_ENCODER),yes)
+DSP_SRCS-yes += sum_squares.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
+endif # CONFIG_VP10_ENCODER
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes += sad.c
 DSP_SRCS-yes += subtract.c
@@ -297,7 +303,6 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
 endif # CONFIG_VP9_HIGHBITDEPTH
 endif # CONFIG_USE_X86INC
-
 endif # CONFIG_ENCODERS

 ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index a9e180e79..b4e6f4c27 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -23,6 +23,18 @@ extern "C" {
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))

+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+#ifdef __GNUC__
+# define LIKELY(v)   __builtin_expect(v, 1)
+# define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+# define LIKELY(v)   (v)
+# define UNLIKELY(v) (v)
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
 // Note:
 // tran_low_t is the datatype used for final transform coefficients.
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 73726d217..8168b482a 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -954,6 +954,14 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";

+if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+#
+# Sum of Squares
+#
+  add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
+  specialize qw/vpx_sum_squares_2d_i16 sse2/;
+}
+
 #
 # Single block SAD
 #
diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 000000000..ed1dc0c53
--- /dev/null
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdint.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
+                                                int stride) {
+  const __m128i v_val_0_w = _mm_loadl_epi64((const __m128i*)(src+0*stride));
+  const __m128i v_val_1_w = _mm_loadl_epi64((const __m128i*)(src+1*stride));
+  const __m128i v_val_2_w = _mm_loadl_epi64((const __m128i*)(src+2*stride));
+  const __m128i v_val_3_w = _mm_loadl_epi64((const __m128i*)(src+3*stride));
+
+  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+  const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d,
+                                        _mm_srli_epi64(v_sum_0123_d, 32));
+
+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src,
+                                                int stride,
+                                                int size) {
+  int r, c;
+
+  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  __m128i v_acc_q = _mm_setzero_si128();
+
+  for (r = 0; r < size; r += 8) {
+    __m128i v_acc_d = _mm_setzero_si128();
+
+    for (c = 0; c < size; c += 8) {
+      const int16_t *b = src+c;
+
+      const __m128i v_val_0_w = _mm_load_si128((const __m128i*)(b+0*stride));
+      const __m128i v_val_1_w = _mm_load_si128((const __m128i*)(b+1*stride));
+      const __m128i v_val_2_w = _mm_load_si128((const __m128i*)(b+2*stride));
+      const __m128i v_val_3_w = _mm_load_si128((const __m128i*)(b+3*stride));
+      const __m128i v_val_4_w = _mm_load_si128((const __m128i*)(b+4*stride));
+      const __m128i v_val_5_w = _mm_load_si128((const __m128i*)(b+5*stride));
+      const __m128i v_val_6_w = _mm_load_si128((const __m128i*)(b+6*stride));
+      const __m128i v_val_7_w = _mm_load_si128((const __m128i*)(b+7*stride));
+
+      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+    }
+
+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+    src += 8*stride;
+  }
+
+  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_acc_q);
+    return tmp;
+  }
+#endif
+}
+
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride,
+                                     int size) {
+  // 4 elements per row only requires half an XMM register, so this
+  // must be a special case, but also note that over 75% of all calls
+  // are with size == 4, so it is also the common case.
+  if (LIKELY(size == 4)) {
+    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else {
+    // Generic case
+    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
+  }
+}
-- 
2.40.0
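
Note (appended for review, not part of the patch): the rdopt.c change computes
sse from p->src_diff because, after vp10_encode_block_intra() has run, that
buffer holds the source minus the intra prediction for the block, so summing
its squares gives the same SSE the removed variance call against the predicted
block produced, without re-running vp10_predict_intra_block(). Below is a
minimal standalone C sketch of that identity; the buffer names, sizes and fill
pattern are illustrative only, not the libvpx API, and only
sum_squares_2d_i16() mirrors code from the patch (vpx_sum_squares_2d_i16_c).

    /* Standalone sketch: SSE(src, pred) == sum_squares(src - pred). */
    #include <stdint.h>
    #include <stdio.h>

    /* Same loop as vpx_sum_squares_2d_i16_c() in the patch. */
    static uint64_t sum_squares_2d_i16(const int16_t *src, int stride,
                                       int size) {
      uint64_t ss = 0;
      int r, c;
      for (r = 0; r < size; r++) {
        for (c = 0; c < size; c++) ss += (int64_t)src[c] * src[c];
        src += stride;
      }
      return ss;
    }

    int main(void) {
      enum { kSize = 8, kStride = 16 };  /* illustrative block geometry */
      uint8_t src[kSize * kStride], pred[kSize * kStride];
      int16_t diff[kSize * kStride];     /* stands in for p->src_diff */
      uint64_t sse = 0;
      int r, c;

      /* Fill source and prediction with arbitrary pixels, form the residual,
       * and accumulate the SSE directly, as a variance function would. */
      for (r = 0; r < kSize; r++) {
        for (c = 0; c < kSize; c++) {
          const int i = r * kStride + c;
          src[i] = (uint8_t)(17 * i + 3);
          pred[i] = (uint8_t)(5 * i + 40);
          diff[i] = (int16_t)(src[i] - pred[i]);
          sse += (uint64_t)((src[i] - pred[i]) * (src[i] - pred[i]));
        }
      }

      /* Both lines print the same value; this is the equivalence transform
       * the commit message refers to. */
      printf("direct SSE            = %llu\n", (unsigned long long)sse);
      printf("sum_squares(residual) = %llu\n",
             (unsigned long long)sum_squares_2d_i16(diff, kStride, kSize));
      return 0;
    }

The SSE2 version in the patch reaches the same result with _mm_madd_epi16,
which squares adjacent pairs of 16-bit residuals and adds each pair into a
32-bit lane, so an 8x8 tile takes only eight loads and eight multiply-adds
before the 32-bit partial sums are folded into the 64-bit accumulator.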