From: Johann <johann.koenig@duck.com>
Date: Wed, 24 Oct 2018 22:48:32 +0000 (-0700)
Subject: vp8 bilinear: rewrite 16x16
X-Git-Tag: v1.8.0~205^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ad0ed535a702ebe4164c0bf5fd5f3211269f1fad;p=libvpx

vp8 bilinear: rewrite 16x16

Marginally faster. Most importantly it drops a dependency on an
external symbol (vp8_bilinear_filters_x86_8).

Change-Id: Iff022e718720f1f0eeced6201a1ad69a9c9c4f45
---

diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c
index db91bdca4..224c1b32a 100644
--- a/vp8/common/x86/bilinear_filter_sse2.c
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -12,9 +12,133 @@
 #include <xmmintrin.h>
 
 #include "./vp8_rtcd.h"
+#include "./vpx_config.h"
 #include "vp8/common/filter.h"
 #include "vpx_ports/mem.h"
 
+// First pass of the 16x16 bilinear predictor. Reads 17 rows from src (stride
+// bytes apart) and writes 16 uint16_t values per row to dst with aligned
+// stores. With xoffset == 0 the bytes are only widened; otherwise each byte is
+// blended with its right-hand neighbour using vp8_bilinear_filters[xoffset].
+static INLINE void horizontal_16x16(uint8_t *src, const int stride,
+                                    uint16_t *dst, const int xoffset) {
+  const __m128i zeros = _mm_setzero_si128();
+  int row = 0;
+
+  if (xoffset != 0) {
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+
+    while (row < 17) {
+      // left holds bytes 0..15 of the row, right holds bytes 1..16.
+      const __m128i left = _mm_loadu_si128((__m128i *)src);
+      const __m128i right = _mm_loadu_si128((__m128i *)(src + 1));
+
+      // Widen both vectors to 16 bits: *_lo is lanes 0..7, *_hi is 8..15.
+      const __m128i left_lo = _mm_unpacklo_epi8(left, zeros);
+      const __m128i left_hi = _mm_unpackhi_epi8(left, zeros);
+      const __m128i right_lo = _mm_unpacklo_epi8(right, zeros);
+      const __m128i right_hi = _mm_unpackhi_epi8(right, zeros);
+
+      // left * tap_0 + right * tap_1 for each half of the row.
+      __m128i lo = _mm_add_epi16(_mm_mullo_epi16(left_lo, tap_0),
+                                 _mm_mullo_epi16(right_lo, tap_1));
+      __m128i hi = _mm_add_epi16(_mm_mullo_epi16(left_hi, tap_0),
+                                 _mm_mullo_epi16(right_hi, tap_1));
+
+      // Add the rounding term, then shift down by VP8_FILTER_SHIFT.
+      lo = _mm_srai_epi16(_mm_add_epi16(lo, rounding), VP8_FILTER_SHIFT);
+      hi = _mm_srai_epi16(_mm_add_epi16(hi, rounding), VP8_FILTER_SHIFT);
+
+      _mm_store_si128((__m128i *)dst, lo);
+      _mm_store_si128((__m128i *)(dst + 8), hi);
+      // Step to the next source row and the next 16 intermediate values.
+      src += stride;
+      dst += 16;
+      ++row;
+    }
+    return;
+  }
+
+  // xoffset == 0: no filtering, just zero-extend each byte to 16 bits.
+  for (; row < 17; ++row) {
+    const __m128i pixels = _mm_loadu_si128((__m128i *)src);
+    _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi8(pixels, zeros));
+    _mm_store_si128((__m128i *)(dst + 8), _mm_unpackhi_epi8(pixels, zeros));
+    src += stride;
+    dst += 16;
+  }
+}
+
+// Second pass of the 16x16 bilinear predictor. src holds rows of 16 uint16_t
+// values (read with aligned loads); 16 rows of 16 bytes are written to dst,
+// stride bytes apart, with aligned stores. With yoffset == 0 each row is
+// packed to bytes as is; otherwise consecutive rows are blended with the
+// weights in vp8_bilinear_filters[yoffset], which reads 17 source rows.
+static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride,
+                                  const int yoffset) {
+  int row = 0;
+
+  if (yoffset != 0) {
+    const __m128i tap_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+    const __m128i tap_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+    const __m128i rounding = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+
+    // Prime the loop with row 0; each iteration then loads only the row below.
+    __m128i above_lo = _mm_load_si128((__m128i *)src);
+    __m128i above_hi = _mm_load_si128((__m128i *)(src + 8));
+
+    while (row < 16) {
+      // src still points at the upper row, so the row below starts at +16.
+      const __m128i below_lo = _mm_load_si128((__m128i *)(src + 16));
+      const __m128i below_hi = _mm_load_si128((__m128i *)(src + 24));
+
+      // above * tap_0 + below * tap_1 for each half of the row.
+      __m128i lo = _mm_add_epi16(_mm_mullo_epi16(above_lo, tap_0),
+                                 _mm_mullo_epi16(below_lo, tap_1));
+      __m128i hi = _mm_add_epi16(_mm_mullo_epi16(above_hi, tap_0),
+                                 _mm_mullo_epi16(below_hi, tap_1));
+
+      // Add the rounding term, then shift down by VP8_FILTER_SHIFT.
+      lo = _mm_srai_epi16(_mm_add_epi16(lo, rounding), VP8_FILTER_SHIFT);
+      hi = _mm_srai_epi16(_mm_add_epi16(hi, rounding), VP8_FILTER_SHIFT);
+
+      // Saturate to unsigned bytes and write one 16-byte output row.
+      _mm_store_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
+
+      // The row just loaded becomes the upper row of the next iteration.
+      above_lo = below_lo;
+      above_hi = below_hi;
+      src += 16;
+      dst += stride;
+      ++row;
+    }
+    return;
+  }
+
+  // yoffset == 0: no filtering, only pack each row down to bytes.
+  for (; row < 16; ++row) {
+    const __m128i lo = _mm_load_si128((__m128i *)src);
+    const __m128i hi = _mm_load_si128((__m128i *)(src + 8));
+    _mm_store_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
+    src += 16;
+    dst += stride;
+  }
+}
+
+// 16x16 bilinear prediction: horizontal pass from src_ptr into FData, then
+// vertical pass into dst_ptr. One of xoffset/yoffset must be non-zero.
+void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+                                    int xoffset, int yoffset, uint8_t *dst_ptr,
+                                    int dst_pitch) {
+  // The helpers use aligned SSE2 loads/stores on FData, so align it to 16.
+  DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]);
+  assert((xoffset | yoffset) != 0);
+  horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset);
+  vertical_16x16(FData, dst_ptr, dst_pitch, yoffset);
+}
+
 static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
                                   const int xoffset, const int height) {
   int h;
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index f1ec55b27..51c015e3d 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -10,7 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-extern sym(vp8_bilinear_filters_x86_8)
 
 %define BLOCK_HEIGHT_WIDTH 4
 %define VP8_FILTER_WEIGHT 128
@@ -958,274 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2):
     ret
 
 
-;void vp8_bilinear_predict16x16_sse2
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-extern sym(vp8_bilinear_filters_x86_8)
-global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
-sym(vp8_bilinear_predict16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
-    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
-
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-
-        cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          .b16x16_sp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;HFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          .b16x16_fp_only
-
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-%endif
-        ; get the first horizontal line done
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        add         rsi,        rdx                 ; next line
-.next_row:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       [rax]
-        pmullw      xmm6,       [rax]
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        movdqa      xmm7,       xmm3
-        packuswb    xmm7,       xmm4
-
-        pmullw      xmm3,       [rax+16]
-        pmullw      xmm4,       [rax+16]
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rdx                 ; next line
-%if ABI_IS_32BIT
-        add         rdi,        DWORD PTR arg(5) ;dst_pitch
-%else
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx
-        jne         .next_row
-
-        jmp         .done
-
-.b16x16_sp_only:
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-        shl         rax,        5
-        add         rax,        rcx    ;VFilter
-
-        mov         rdi,        arg(4) ;dst_ptr
-        mov         rsi,        arg(0) ;src_ptr
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-
-        movdqa      xmm1,       [rax]
-        movdqa      xmm2,       [rax+16]
-
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-
-        pxor        xmm0,       xmm0
-
-        ; get the first horizontal line done
-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        add         rsi,        rax                 ; next line
-.next_row_spo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
-        movdqa      xmm5,       xmm7
-        movdqa      xmm6,       xmm7
-
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-        movdqa      xmm7,       xmm3
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm5,       xmm1
-        pmullw      xmm6,       xmm1
-        pmullw      xmm3,       xmm2
-        pmullw      xmm4,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ;dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_spo
-
-        jmp         .done
-
-.b16x16_fp_only:
-        lea         rcx,        [rdi+rdx*8]
-        lea         rcx,        [rcx+rdx*8]
-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
-        pxor        xmm0,       xmm0
-
-.next_row_fpo:
-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movdqa      xmm4,       xmm3                 ; make a copy of current line
-
-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   xmm4,       xmm0
-
-        pmullw      xmm3,       xmm1
-        pmullw      xmm4,       xmm1
-
-        movdqu      xmm5,       [rsi+1]
-        movdqa      xmm6,       xmm5
-
-        punpcklbw   xmm5,       xmm0
-        punpckhbw   xmm6,       xmm0
-
-        pmullw      xmm5,       xmm2
-        pmullw      xmm6,       xmm2
-
-        paddw       xmm3,       xmm5
-        paddw       xmm4,       xmm6
-
-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
-        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       xmm4,       [GLOBAL(rd)]
-        psraw       xmm4,       VP8_FILTER_SHIFT
-
-        packuswb    xmm3,       xmm4
-        movdqa      [rdi],      xmm3                 ; store the results in the destination
-
-        add         rsi,        rax                 ; next line
-        add         rdi,        rdx                 ; dst_pitch
-        cmp         rdi,        rcx
-        jne         .next_row_fpo
-
-.done:
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 rd: