granicus.if.org Git - libvpx/commitdiff
add vp9_satd_neon
author James Zern <jzern@google.com>
Fri, 20 Nov 2015 07:39:10 +0000 (23:39 -0800)
committer James Zern <jzern@google.com>
Wed, 25 Nov 2015 00:09:10 +0000 (16:09 -0800)
~60-65% faster at the function level across block sizes

Change-Id: Iaf8cbe95731c43fdcbf68256e44284ba51a93893

test/vp9_avg_test.cc
vp9/common/vp9_rtcd_defs.pl
vp9/encoder/arm/neon/vp9_avg_neon.c

index 1a9b43062be7f9ec477c37c755f2fea6c10b5b13..290bdc75e9cdd54104323b01b8e72e8eabfadb29 100644 (file)
@@ -385,6 +385,14 @@ INSTANTIATE_TEST_CASE_P(
         make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
         make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vp9_satd_neon),
+        make_tuple(64, &vp9_satd_neon),
+        make_tuple(256, &vp9_satd_neon),
+        make_tuple(1024, &vp9_satd_neon)));
 #endif
 
 #if HAVE_MSA
index 7a2883aba3e21cdfa94b6acd2b3178f8ef9515e5..8fe6503aa3d0e44f3b36b1b558ea889e143b99fa 100644 (file)
@@ -210,7 +210,7 @@ add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride,
 specialize qw/vp9_hadamard_16x16 sse2/;
 
 add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2/;
+specialize qw/vp9_satd sse2 neon/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
 specialize qw/vp9_int_pro_row sse2 neon/;
index d569ec95d37f685b625af342f6b727ef9d41d3ea..5996bd426b26b3fdaeb6f027223b723410e9985e 100644 (file)
@@ -50,6 +50,33 @@ unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
 
+// Returns the sum of |coeff[i]| for i in [0, length). coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}; must be a nonzero multiple of 16 (see the do/while below).
+int vp9_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);  // subtrahend for vabal: |x - 0| == |x|.
+  int32x4_t accum = vdupq_n_s32(0);  // four 32-bit partial sums.
+
+  do {  // each pass consumes 16 coefficients; the body runs before 'length' is tested.
+    const int16x8_t src0 = vld1q_s16(coeff);  // coeff[0..7]
+    const int16x8_t src8 = vld1q_s16(coeff + 8);  // coeff[8..15]
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);  // accum += |lane - 0|, widened to 32 bits.
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);  // exact-zero test: a length that is not a multiple of 16 would never terminate here.
+
+  {
+    // satd: sum of absolute values, so non-negative; at most 32640 * 1024 for the
+    // ranges documented above, which fits in 26 bits (well within int32).
+    const int64x2_t s0 = vpaddlq_s32(accum);  // pairwise widening add: 4 x s32 -> 2 x s64.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));  // add the two halves as 32-bit lanes.
+    const int satd = vget_lane_s32(s1, 0);  // lane 0 is taken as the low 32 bits of the total; the high words are ignored since the sum fits in 26 bits.
+    return satd;
+  }
+}
+
 void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
                           const int ref_stride, const int height) {
   int i;