From c67a2e76a1b317995fc6f7fe40ba773ea55272ba Mon Sep 17 00:00:00 2001 From: Johann Date: Wed, 19 Dec 2018 18:09:11 -0800 Subject: [PATCH] subpixel_8t sse2: resolve missing declarations vpx_asm_stubs.c only references these sse2 functions. Combine the files similar to the way the ssse3/avx2 files are set up. Mark the intrinsics as static because they are only used within the macros here. It is unfortunate that the assembly functions can not be marked static as well. BUG=webm:1584 Change-Id: I342687a1046ae6ca46ae58644a7c170440de1dfb --- vpx_dsp/vpx_dsp.mk | 1 - vpx_dsp/x86/vpx_asm_stubs.c | 194 ---------------- vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm | 12 +- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 261 ++++++++++++++++++---- 4 files changed, 223 insertions(+), 245 deletions(-) delete mode 100644 vpx_dsp/x86/vpx_asm_stubs.c diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 2495db3f4..87460bedf 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -89,7 +89,6 @@ DSP_SRCS-yes += vpx_convolve.h DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 9d6f83787..000000000 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_h4_sse2; -filter8_1dfunction vpx_filter_block1d16_v4_sse2; -filter8_1dfunction vpx_filter_block1d8_h4_sse2; -filter8_1dfunction vpx_filter_block1d8_v4_sse2; -filter8_1dfunction vpx_filter_block1d4_h4_sse2; -filter8_1dfunction vpx_filter_block1d4_v4_sse2; -#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 -#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 -#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 -#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 -#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 -#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; 
-filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v4_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h4_sse2; -#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ - vpx_highbd_filter_block1d16_v8_avg_sse2 -#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ - vpx_highbd_filter_block1d16_h8_avg_sse2 -#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ - vpx_highbd_filter_block1d8_v8_avg_sse2 -#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ - vpx_highbd_filter_block1d8_h8_avg_sse2 -#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ - vpx_highbd_filter_block1d4_v8_avg_sse2 -#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ - vpx_highbd_filter_block1d4_h8_avg_sse2 - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction 
vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t 
src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index e6e72b826..c57149657 100644 --- a/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -199,7 +199,7 @@ SECTION .text -;void vpx_filter_block1d4_v8_sse2 +;void vpx_highbd_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -269,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vpx_filter_block1d8_v8_sse2 +;void vpx_highbd_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -328,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vpx_filter_block1d16_v8_sse2 +;void vpx_highbd_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -554,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vpx_filter_block1d4_h8_sse2 +;void vpx_highbd_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -629,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vpx_filter_block1d8_h8_sse2 +;void vpx_highbd_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -695,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vpx_filter_block1d16_h8_sse2 +;void vpx_highbd_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index 0be2c0fef..e40fe693a 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -19,9 
+19,10 @@ #define CONV8_ROUNDING_BITS (7) #define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) -void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -100,9 +101,10 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, /* The macro used to generate functions shifts the src_ptr up by 3 rows already * */ -void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. 
lo is first half, hi second @@ -255,9 +257,10 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -312,9 +315,10 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. 
lo is first half, hi second @@ -430,9 +434,10 @@ void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { __m128i kernel_reg; // Kernel __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding @@ -495,9 +500,10 @@ void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, - uint8_t *dst_ptr, ptrdiff_t dst_stride, - uint32_t height, const int16_t *kernel) { +static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { // Register for source s[-1:3, :] __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; // Interleaved rows of the source. lo is first half, hi second @@ -608,10 +614,10 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, } } -void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load multiple shifted versions of the row and shuffle them into // 16-bit words of the form // ... 
s[2] s[1] s[0] s[-1] @@ -670,10 +676,9 @@ void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load two rows of pixels as 16-bit words, and shuffle them into the // form // ... s[0,1] s[-1,1] s[0,0] s[-1,0] @@ -774,10 +779,9 @@ void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load multiple shifted versions of the row and shuffle them into // 16-bit words of the form // ... s[2] s[1] s[0] s[-1] @@ -852,10 +856,9 @@ void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d8_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { // We will load two rows of pixels as 16-bit words, and shuffle them into the // form // ... 
s[0,1] s[-1,1] s[0,0] s[-1,0] @@ -982,24 +985,194 @@ void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, } } -void vpx_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d16_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, height, kernel, bd); vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, dst_stride, height, kernel, bd); } -void vpx_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, - ptrdiff_t src_stride, - uint16_t *dst_ptr, - ptrdiff_t dst_stride, uint32_t height, - const int16_t *kernel, int bd) { +static void vpx_highbd_filter_block1d16_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, height, kernel, bd); vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, dst_stride, height, kernel, bd); } +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 + +// From vpx_dsp/x86/vpx_subpixel_8t_sse2.asm. 
+filter8_1dfunction vpx_filter_block1d16_v8_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_sse2; +filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2 +#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2 +#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2 +#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2 +#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2 +#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm. 
+filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int 
y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_, sse2); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. 
+highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, 
avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, + sse2); + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_, sse2); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -- 2.40.0