From 5caec339bea5e37ae8707f9d783feb2aa51cd111 Mon Sep 17 00:00:00 2001
From: Johann <johann.koenig@duck.com>
Date: Thu, 25 Oct 2018 12:23:03 -0700
Subject: [PATCH] vp8 bilinear: rewrite 4x4

~20% faster than the MMX version. Removes the last usage of
vp8_bilinear_filters_x86_[48], so filter_x86.[ch] are deleted as well.

Change-Id: Iee976fab9655d0020440f26c4403ce50103af913
---
 test/predict_test.cc                  |   8 +-
 vp8/common/rtcd_defs.pl               |   2 +-
 vp8/common/x86/bilinear_filter_sse2.c |  93 ++++++++++++++++++++
 vp8/common/x86/filter_x86.c           |  29 ------
 vp8/common/x86/filter_x86.h           |  33 -------
 vp8/common/x86/subpixel_mmx.asm       | 121 --------------------------
 vp8/common/x86/vp8_asm_stubs.c        |   1 -
 vp8/vp8_common.mk                     |   2 -
 vpx_dsp/x86/mem_sse2.h                |  11 +++
 9 files changed, 107 insertions(+), 193 deletions(-)
 delete mode 100644 vp8/common/x86/filter_x86.c
 delete mode 100644 vp8/common/x86/filter_x86.h

diff --git a/test/predict_test.cc b/test/predict_test.cc
index a8fcfc0b1..1c2f2d11d 100644
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -379,17 +379,13 @@ INSTANTIATE_TEST_CASE_P(
                       make_tuple(8, 4, &vp8_bilinear_predict8x4_neon),
                       make_tuple(4, 4, &vp8_bilinear_predict4x4_neon)));
 #endif
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(
-    MMX, BilinearPredictTest,
-    ::testing::Values(make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
-#endif
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
     SSE2, BilinearPredictTest,
     ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
                       make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2),
-                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2)));
+                      make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2),
+                      make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2)));
 #endif
 #if HAVE_SSSE3
 INSTANTIATE_TEST_CASE_P(
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index f67025767..3ab89a338 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -167,7 +167,7 @@ add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch,
 specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
 
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
+specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/;
 
 #
 # Encoder functions below this point.
diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c
index 224c1b32a..14e10eca4 100644
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -14,6 +14,7 @@
 #include "./vp8_rtcd.h"
 #include "./vpx_config.h"
 #include "vp8/common/filter.h"
+#include "vpx_dsp/x86/mem_sse2.h"
 #include "vpx_ports/mem.h"
 
 static INLINE void horizontal_16x16(uint8_t *src, const int stride,
@@ -241,3 +242,95 @@ void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
 
   vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
 }
+
+// First pass of the 4x4 bilinear predictor. Filters 5 rows of 4 pixels from
+// |src|, |stride| bytes apart, with the horizontal taps selected by |xoffset|
+// and writes them to |dst| as 5 consecutive rows of 4 16-bit values. The fifth
+// row is only needed by the vertical pass to blend into the last output row.
+// |src| may have any alignment; up to 5 bytes are read from each row.
+static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst,
+                                  const int xoffset) {
+  const __m128i zeros = _mm_setzero_si128();
+  int row;
+
+  if (xoffset != 0) {
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+
+    for (row = 0; row < 5; ++row, src += stride, dst += 4) {
+      // Widen each pixel and its right-hand neighbor to 16 bits.
+      const __m128i left = _mm_unpacklo_epi8(load_unaligned_u32(src), zeros);
+      const __m128i right =
+          _mm_unpacklo_epi8(load_unaligned_u32(src + 1), zeros);
+      // (left * tap_0 + right * tap_1 + rounding) >> VP8_FILTER_SHIFT
+      const __m128i weighted = _mm_add_epi16(_mm_mullo_epi16(left, tap_0),
+                                             _mm_mullo_epi16(right, tap_1));
+      const __m128i rounded = _mm_add_epi16(weighted, rounding);
+      const __m128i filtered = _mm_srai_epi16(rounded, VP8_FILTER_SHIFT);
+      // Only the low 64 bits (4 x 16-bit results) are stored.
+      _mm_storel_epi64((__m128i *)dst, filtered);
+    }
+  } else {
+    // xoffset == 0: no filtering, just widen the pixels to 16 bits.
+    for (row = 0; row < 5; ++row, src += stride, dst += 4) {
+      const __m128i pixels = load_unaligned_u32(src);
+      const __m128i widened = _mm_unpacklo_epi8(pixels, zeros);
+      _mm_storel_epi64((__m128i *)dst, widened);
+    }
+  }
+}
+
+// Second pass of the 4x4 bilinear predictor. Blends every row of |src| (5 rows
+// of 4 16-bit values) with the row below it, using the taps selected by
+// |yoffset|, and writes 4 rows of 4 bytes to |dst|, |stride| bytes apart.
+// |src| must be 16-byte aligned because it is read with _mm_load_si128().
+static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride,
+                                const int yoffset) {
+  int h;
+
+  if (yoffset == 0) {  // No filtering: only narrow the rows to 8 bits.
+    for (h = 0; h < 4; h += 2) {
+      const __m128i row = _mm_load_si128((__m128i *)src);
+      __m128i packed = _mm_packus_epi16(row, row);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      packed = _mm_srli_si128(packed, 4);  // Next row into the low 32 bits.
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      src += 8;
+    }
+  } else {
+    const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+    const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+    for (h = 0; h < 4; h += 2) {
+      // row_0 holds rows h and h + 1; row_1 holds rows h + 1 and h + 2.
+      const __m128i row_0 = _mm_load_si128((__m128i *)src);
+      const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4));
+      const __m128i sum = _mm_add_epi16(_mm_mullo_epi16(row_0, vfilter_0),
+                                        _mm_mullo_epi16(row_1, vfilter_1));
+      const __m128i compensated = _mm_add_epi16(sum, round_factor);
+      const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+      __m128i packed = _mm_packus_epi16(shifted, shifted);
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      packed = _mm_srli_si128(packed, 4);  // Next row into the low 32 bits.
+      store_unaligned_u32(dst, packed);
+      dst += stride;
+      src += 8;
+    }
+  }
+}
+
+void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                  int xoffset, int yoffset, uint8_t *dst_ptr,
+                                  int dst_pitch) {
+  // Intermediate of the two-pass filter: 5 rows x 4 columns of 16-bit values.
+  // Must be 16-byte aligned: vertical_4x4() reads it with _mm_load_si128().
+  DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]);
+
+  assert((xoffset | yoffset) != 0);  // At least one offset must be non-zero.
+  horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset);
+  vertical_4x4(FData, dst_ptr, dst_pitch, yoffset);
+}
diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
deleted file mode 100644
index 2405342f0..000000000
--- a/vp8/common/x86/filter_x86.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp8/common/x86/filter_x86.h"
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
-  { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
-  { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
-  { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
-  { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
-  { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-  { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
-  { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
-  { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
-};
diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h
deleted file mode 100644
index 570ff8666..000000000
--- a/vp8/common/x86/filter_x86.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_VP8_COMMON_X86_FILTER_X86_H_
-#define VPX_VP8_COMMON_X86_FILTER_X86_H_
-
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
- * duplicated values */
-
-/* duplicated 4x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]);
-
-/* duplicated 8x */
-extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VPX_VP8_COMMON_X86_FILTER_X86_H_
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 05320d58d..67bcd0cbd 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -10,8 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
-
 
 %define BLOCK_HEIGHT_WIDTH 4
 %define vp8_filter_weight 128
@@ -205,125 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx):
     ret
 
 
-;void bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
-sym(vp8_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
 SECTION_RODATA
 align 16
 rd:
diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index de836f19d..7fb83c2d5 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c
@@ -11,7 +11,6 @@
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
 #include "vpx_ports/mem.h"
-#include "filter_x86.h"
 
 extern const short vp8_six_tap_x86[8][6 * 8];
 
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index d2d5712a5..9f106a2c3 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -70,8 +70,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
-VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index 5209a0628..258ab38e6 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -26,6 +26,17 @@ static INLINE uint32_t loadu_uint32(const void *src) {
   return v;
 }
 
+static INLINE __m128i load_unaligned_u32(const void *a) {
+  int32_t val;  // Signed, to match the int parameter of _mm_cvtsi32_si128().
+  memcpy(&val, a, sizeof(val));  // |a| may have any alignment.
+  return _mm_cvtsi32_si128(val);  // The 4 bytes land in the low 32 bits.
+}
+
+static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
+  const int32_t low_lane = _mm_cvtsi128_si32(v);  // Low 32 bits of |v|.
+  memcpy(a, &low_lane, sizeof(low_lane));  // |a| may have any alignment.
+}
+
 #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
 #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
 
-- 
2.50.1