From: Johann <johann.koenig@duck.com>
Date: Wed, 24 Oct 2018 19:22:35 +0000 (-0700)
Subject: vp8 bilinear: rewrite in intrinsics
X-Git-Tag: v1.8.0~207^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6f35ac956b8315b466c7f66af1172fbc473f1464;p=libvpx

vp8 bilinear: rewrite in intrinsics

8x8 is 15% faster than the assembly. 8x4 is 200% faster than MMX.

Remove MMX version.

Change-Id: I55642ebd276db265911f2c79616177a3a9a7e04f
---

diff --git a/test/predict_test.cc b/test/predict_test.cc
index a4fcd352f..a8fcfc0b1 100644
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -16,6 +16,7 @@
 #include "./vp8_rtcd.h"
 #include "./vpx_config.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "test/util.h"
@@ -33,7 +34,8 @@ typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line,
 
 typedef ::testing::tuple<int, int, PredictFunc> PredictParam;
 
-class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
+class PredictTestBase : public AbstractBench,
+                        public ::testing::TestWithParam<PredictParam> {
  public:
   PredictTestBase()
       : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
@@ -204,7 +206,20 @@ class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
       }
     }
   }
-};
+
+  void Run() {
+    for (int xoffset = 0; xoffset < 8; ++xoffset) {
+      for (int yoffset = 0; yoffset < 8; ++yoffset) {
+        if (xoffset == 0 && yoffset == 0) {
+          continue;
+        }
+
+        predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_,
+                 dst_stride_);
+      }
+    }
+  }
+};  // namespace
 
 class SixtapPredictTest : public PredictTestBase {};
 
@@ -341,6 +356,14 @@ TEST_P(BilinearPredictTest, TestWithRandomData) {
 TEST_P(BilinearPredictTest, TestWithUnalignedDst) {
   TestWithUnalignedDst(vp8_bilinear_predict16x16_c);
 }
+TEST_P(BilinearPredictTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 5000000 / (width_ * height_);
+  RunNTimes(kCountSpeedTestBlock);
+
+  char title[16];
+  snprintf(title, sizeof(title), "%dx%d", width_, height_);
+  PrintMedian(title);
+}
 
 INSTANTIATE_TEST_CASE_P(
     C, BilinearPredictTest,
@@ -359,14 +382,14 @@ INSTANTIATE_TEST_CASE_P(
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(
     MMX, BilinearPredictTest,
-    ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
-                      make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
+    ::testing::Values(make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
 #endif
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
-                      make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2)));
+                      make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2),
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2)));
 #endif
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 3df745f75..f67025767 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -164,7 +164,7 @@ add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch,
 specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
 
 add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
+specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
 specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c
new file mode 100644
index 000000000..db91bdca4
--- /dev/null
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <xmmintrin.h>
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset, const int height) {
+  int h;
+  const __m128i zero = _mm_setzero_si128();
+
+  if (xoffset == 0) {
+    for (h = 0; h < height; ++h) {
+      const __m128i a = _mm_loadl_epi64((__m128i *)src);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      _mm_store_si128((__m128i *)dst, a_u16);
+      src += stride;
+      dst += 8;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+    // Filter horizontally. Rather than load the whole array and transpose, load
+    // 16 values (overreading) and shift to set up the second value. Do an
+    // "extra" 9th line so the vertical pass has the necessary context.
+    for (h = 0; h < height; ++h) {
+      const __m128i a = _mm_loadu_si128((__m128i *)src);
+      const __m128i b = _mm_srli_si128(a, 1);
+      const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+      const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+      const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+      const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+      const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      _mm_store_si128((__m128i *)dst, shifted);
+      src += stride;
+      dst += 8;
+    }
+  }
+}
+
+static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset, const int height) {
+  int h;
+
+  if (yoffset == 0) {
+    for (h = 0; h < height; ++h) {
+      const __m128i row = _mm_load_si128((__m128i *)src);
+      const __m128i packed = _mm_packus_epi16(row, row);
+      _mm_storel_epi64((__m128i *)dst, packed);
+      src += 8;
+      dst += stride;
+    }
+    return;
+  }
+
+  {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+    __m128i row_0 = _mm_load_si128((__m128i *)src);
+    src += 8;
+    for (h = 0; h < height; ++h) {
+      const __m128i row_1 = _mm_load_si128((__m128i *)src);
+      const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+      const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+      const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      const __m128i packed = _mm_packus_epi16(shifted, shifted);
+      _mm_storel_epi64((__m128i *)dst, packed);
+      row_0 = row_1;
+      src += 8;
+      dst += stride;
+    }
+  }
+}
+
+void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  uint16_t FData[8 * 9];
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);
+
+  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
+}
+
+void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  uint16_t FData[8 * 5];
+
+  assert((xoffset | yoffset) != 0);
+
+  horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);
+
+  vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
+}
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 1f3a2baca..05320d58d 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -205,161 +205,6 @@ sym(vp8_filter_block1dc_v6_mmx):
     ret
 
 
-;void bilinear_predict8x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
-sym(vp8_bilinear_predict8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        shl         rax,        5
-
-        mov         rsi,        arg(0) ;src_ptr              ;
-        add         rax,        rcx
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x4:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP8_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void bilinear_predict4x4_mmx
 ;(
 ;    unsigned char  *src_ptr,
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 6e70f6d2e..f1ec55b27 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -1226,151 +1226,6 @@ sym(vp8_bilinear_predict16x16_sse2):
     pop         rbp
     ret
 
-
-;void vp8_bilinear_predict8x8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
-sym(vp8_bilinear_predict8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 144                         ; reserve 144 bytes
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read 9-line unaligned data in and put them on stack. This gives a big
-    ;performance boost.
-        movdqu      xmm0,       [rsi]
-        lea         rax,        [rdx + rdx*2]
-        movdqu      xmm1,       [rsi+rdx]
-        movdqu      xmm2,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm3,       [rsi]
-        movdqu      xmm4,       [rsi+rdx]
-        movdqu      xmm5,       [rsi+rdx*2]
-        add         rsi,        rax
-        movdqu      xmm6,       [rsi]
-        movdqu      xmm7,       [rsi+rdx]
-
-        movdqa      XMMWORD PTR [rsp],            xmm0
-
-        movdqu      xmm0,       [rsi+rdx*2]
-
-        movdqa      XMMWORD PTR [rsp+16],         xmm1
-        movdqa      XMMWORD PTR [rsp+32],         xmm2
-        movdqa      XMMWORD PTR [rsp+48],         xmm3
-        movdqa      XMMWORD PTR [rsp+64],         xmm4
-        movdqa      XMMWORD PTR [rsp+80],         xmm5
-        movdqa      XMMWORD PTR [rsp+96],         xmm6
-        movdqa      XMMWORD PTR [rsp+112],        xmm7
-        movdqa      XMMWORD PTR [rsp+128],        xmm0
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-
-        movdqa      xmm5,       [rax]
-        movdqa      xmm6,       [rax+16]
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqa      xmm3,       XMMWORD PTR [rsp]
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm7,       xmm3
-        add         rsp,        16                 ; next line
-.next_row8x8:
-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        psrldq      xmm4,       1
-
-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm4
-        pmullw      xmm7,       xmm5
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movdqa      xmm4,       xmm3
-
-        pmullw      xmm3,       xmm6
-        paddw       xmm3,       xmm7
-
-        movdqa      xmm7,       xmm4
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    xmm3,       xmm0
-        movq        [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsp,        16                 ; next line
-        add         rdi,        rdx
-
-        cmp         rdi,        rcx
-        jne         .next_row8x8
-
-    ;add rsp, 144
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 SECTION_RODATA
 align 16
 rd:
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 246fe6a67..d2d5712a5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -86,6 +86,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm