From 51dc998f3a2d04b9aa293cff000be34c5eaa5b9d Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 19 Apr 2017 13:08:25 -0700 Subject: [PATCH] Update highbd convolve functions arguments to use uint16_t src/dst BUG=webm:1388 Change-Id: I6912de2639895d817ce850da8ea9f6c8fe21da42 --- test/convolve_test.cc | 15 ++-- vp9/common/vp9_reconinter.h | 5 +- vp9/encoder/vp9_encoder.c | 10 +-- vp9/encoder/vp9_pickmode.c | 14 ++-- vp9/encoder/vp9_rdopt.c | 5 +- vpx_dsp/arm/highbd_vpx_convolve8_neon.c | 38 ++++------ vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c | 7 +- vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c | 7 +- vpx_dsp/arm/highbd_vpx_convolve_neon.c | 32 ++++---- vpx_dsp/vpx_convolve.c | 83 +++++++++------------ vpx_dsp/vpx_convolve.h | 4 +- vpx_dsp/vpx_dsp_rtcd_defs.pl | 16 ++-- vpx_dsp/x86/convolve.h | 67 ++++++++--------- vpx_dsp/x86/highbd_convolve_avx2.c | 12 +-- 14 files changed, 136 insertions(+), 179 deletions(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index a8bab4082..8b9d081a0 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -917,13 +917,14 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -#define WRAP(func, bd) \ - void wrap_##func##_##bd( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(src, src_stride, dst, dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ +#define WRAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ + const int16_t *filter_y, int filter_y_stride, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ + reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \ + filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ } 
#if HAVE_SSE2 && ARCH_X86_64 diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index cb7d1c63a..f4d97afda 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -37,9 +37,8 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src)), src_stride, - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), dst_stride, kernel[subpel_x], - xs, kernel[subpel_y], ys, w, h, bd); + CONVERT_TO_SHORTPTR(src), src_stride, CONVERT_TO_SHORTPTR(dst), + dst_stride, kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index d82b706be..b1b4ba0e0 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2418,11 +2418,11 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); if (src->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve8( - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(src_ptr)), src_stride, - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst_ptr)), dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, kernel[y_q4 & 0xf], - 16 * src_h / dst_h, 16 / factor, 16 / factor, bd); + vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + kernel[x_q4 & 0xf], 16 * src_w / dst_w, + kernel[y_q4 & 0xf], 16 * src_h / dst_h, + 16 / factor, 16 / factor, bd); } else { vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel[x_q4 & 0xf], 16 * src_w / dst_w, diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index c82930f56..70cb4467d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -2078,10 +2078,9 @@ void vp9_pick_inter_mode(VP9_COMP 
*cpi, MACROBLOCK *x, TileDataEnc *tile_data, #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) vpx_highbd_convolve_copy( - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)), - best_pred->stride, - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(this_mode_pred->data)), - this_mode_pred->stride, NULL, 0, NULL, 0, bw, bh, xd->bd); + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, + NULL, 0, NULL, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, @@ -2189,10 +2188,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) vpx_highbd_convolve_copy( - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(best_pred->data)), - best_pred->stride, - CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(pd->dst.buf)), pd->dst.stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0, + bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, pd->dst.stride, NULL, 0, NULL, 0, bw, bh); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2278ddc0f..5e566e1da 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -599,9 +599,8 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_convolve_copy(CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)), - dst_stride, recon, 32, NULL, 0, NULL, 0, bs, - bs, xd->bd); + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, + 32, NULL, 0, NULL, 0, bs, bs, xd->bd); recon = CONVERT_TO_BYTEPTR(recon16); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c 
b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index a00aa0444..74345e1fa 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -135,18 +135,16 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, return d; } -void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -336,20 +334,17 @@ void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, // unused int y_step_q4, // unused int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_x); const uint16x8_t max = vdupq_n_u16((1 
<< bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -569,18 +564,16 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, } } -void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, // unused int x_step_q4, // unused const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); @@ -736,20 +729,17 @@ void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, // unused int x_step_q4, // unused const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h, bd); + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } else { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); const int16x8_t filters = vld1q_s16(filter_y); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); diff --git a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index b244caea9..4ff3dea08 100644 --- 
a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -13,14 +13,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); - (void)filter_x; (void)filter_x_stride; (void)filter_y; diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9401e7b8c..61712d48e 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -13,14 +13,11 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); - (void)filter_x; (void)filter_x_stride; (void)filter_y; diff --git a/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_neon.c index 03a36e4a0..f769620a4 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -13,12 +13,11 @@ #include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" -void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t 
dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CAST_TO_SHORTPTR(src8); const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -29,22 +28,20 @@ void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ - vpx_highbd_convolve8_horiz_neon( - CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp), - w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const uint16_t *src = CAST_TO_SHORTPTR(src8); const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); @@ -54,10 +51,9 @@ void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, /* This implementation has the same issues as above. 
In addition, we only want * to average the values after both passes. */ - vpx_highbd_convolve8_horiz_neon( - CAST_TO_BYTEPTR(src - src_stride * 3), src_stride, CAST_TO_BYTEPTR(temp), - w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(CAST_TO_BYTEPTR(temp + w * 3), w, dst, - dst_stride, filter_x, x_step_q4, filter_y, - y_step_q4, w, h, bd); + vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); } diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index 5a62836eb..02c5a955a 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -319,13 +319,11 @@ void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { @@ -343,13 +341,11 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_horiz(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; for (y 
= 0; y < h; ++y) { @@ -369,13 +365,11 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -395,13 +389,11 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -423,8 +415,8 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, int x_step_q4, const InterpKernel *const y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { @@ -449,16 +441,15 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 <= 32); assert(x_step_q4 <= 32); - highbd_convolve_horiz(CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - - src_stride * (SUBPEL_TAPS / 2 - 1)), - src_stride, CAST_TO_BYTEPTR(temp), 64, x_filters, x0_q4, - x_step_q4, w, 
intermediate_height, bd); - highbd_convolve_vert(CAST_TO_BYTEPTR(temp + 64 * (SUBPEL_TAPS / 2 - 1)), 64, - dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); + highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + y_filters, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -472,8 +463,8 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -487,8 +478,8 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h, bd); } -void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -502,8 +493,8 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, 
ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -517,8 +508,8 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -531,8 +522,8 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, filters_y, y0_q4, y_step_q4, w, h, bd); } -void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { @@ -541,20 +532,18 @@ void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, CAST_TO_BYTEPTR(temp), 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(CAST_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL, 0, - NULL, 0, w, h, bd); + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, + filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h, + bd); } -void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int r; - const uint16_t 
*src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; @@ -569,14 +558,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int x, y; - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_x_stride; diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h index ee9744b3a..1aedd32bd 100644 --- a/vpx_dsp/vpx_convolve.h +++ b/vpx_dsp/vpx_convolve.h @@ -24,8 +24,8 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, int h); #if CONFIG_VP9_HIGHBITDEPTH -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, +typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index ed0339cbe..667352a64 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -372,28 +372,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const 
int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, 
uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index ea7016416..e69d6c617 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ 
-103,12 +103,10 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, #define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ void vpx_highbd_convolve8_##name##_##opt( \ - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ if (step_q4 == 16 && filter[3] != 128) { \ - uint16_t *src = CAST_TO_SHORTPTR(src8); \ - uint16_t *dst = CAST_TO_SHORTPTR(dst8); \ if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ @@ -156,43 +154,42 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } \ if (w) { \ - vpx_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ filter_x, x_step_q4, filter_y, \ y_step_q4, w, h, bd); \ } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt( \ - CAST_TO_BYTEPTR(CAST_TO_SHORTPTR(src) - 3 * src_stride), \ - src_stride, CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CAST_TO_BYTEPTR(fdata2 + 192), 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt( \ - src, src_stride, 
CAST_TO_BYTEPTR(fdata2), 64, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - CAST_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 2b774bf23..fe0fa294a 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -16,13 +16,11 @@ // ----------------------------------------------------------------------------- // 
Copy and average -void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int width, int h, int bd) { - const uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_y; (void)filter_x_stride; @@ -99,13 +97,11 @@ void vpx_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, } } -void vpx_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, +void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, int filter_y_stride, int width, int h, int bd) { - uint16_t *src = CAST_TO_SHORTPTR(src8); - uint16_t *dst = CAST_TO_SHORTPTR(dst8); (void)filter_x; (void)filter_y; (void)filter_x_stride; -- 2.40.0