From 40ab0424d4884190a3450b1e7edf8155c60f011d Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Thu, 27 Oct 2016 16:06:07 -0700 Subject: [PATCH] Add high bitdepth intra prediction NEON optimization (mode d45 and d135) BUG=webm:1316 Change-Id: I6a330874348df04df24a6d9efdc06f567e04bf8e --- test/test_intra_pred_speed.cc | 24 +- test/vp9_intrapred_test.cc | 48 ++++ vpx_dsp/arm/highbd_intrapred_neon.c | 410 ++++++++++++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 + 4 files changed, 482 insertions(+), 8 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index f8efc18dd..f23e946f0 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -480,26 +480,34 @@ HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, - vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_dc_128_predictor_4x4_neon, NULL, NULL, + vpx_highbd_d45_predictor_4x4_neon, + vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, + NULL, NULL) HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, - vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_dc_128_predictor_8x8_neon, NULL, NULL, + vpx_highbd_d45_predictor_8x8_neon, + vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, + NULL, NULL) HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, vpx_highbd_dc_left_predictor_16x16_neon, vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_dc_128_predictor_16x16_neon, NULL, NULL, + vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, + NULL, NULL) HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, vpx_highbd_dc_left_predictor_32x32_neon, vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_dc_128_predictor_32x32_neon, NULL, NULL, + vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, + NULL, NULL) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 87163d0f6..d573baef6 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -416,6 +416,22 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -452,6 +468,22 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -488,6 +520,22 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_neon, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_neon, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_neon, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_neon, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6da1b9f3b..9177fb45d 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -283,3 +283,413 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, (void)left; dc_store_32x32(dst, stride, dc); } + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t ABCDEFGH = vld1q_u16(above); + const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); + const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); + const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); + const uint16x4_t avg2_low = vget_low_u16(avg2); + const uint16x4_t avg2_high = vget_high_u16(avg2); + const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); + const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); + const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + (void)left; + (void)bd; + vst1_u16(dst, avg2_low); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, r3); + vst1q_lane_u16(dst + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row) { + *row = vextq_u16(*row, above_right, 1); + vst1q_u16(*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0 = vld1q_u16(above); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); + const uint16x8_t A1 = vld1q_u16(above + 1); + const uint16x8_t A2 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(A0, A2); + uint16x8_t row = vrhaddq_u16(avg1, A1); + (void)left; + (void)bd; + + vst1q_u16(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1q_u16(dst, above_right); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row_0, + uint16x8_t *row_1) { + *row_0 = vextq_u16(*row_0, *row_1, 1); + *row_1 = vextq_u16(*row_1, above_right, 1); + vst1q_u16(*dst, *row_0); + *dst += 8; + vst1q_u16(*dst, *row_1); + *dst += stride - 8; +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + vst1q_u16(dst + 8, row_1); + dst += stride; + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + vst1q_u16(dst, above_right); + vst1q_u16(dst + 8, above_right); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t A0_2 = vld1q_u16(above + 16); + const uint16x8_t A0_3 = vld1q_u16(above + 24); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A1_2 = vld1q_u16(above + 17); + const uint16x8_t A1_3 = vld1q_u16(above + 25); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t A2_2 = vld1q_u16(above + 18); + const uint16x8_t A2_3 = vld1q_u16(above + 26); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); + const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); + uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + int i; + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, above_right, 1); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + } + + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index da4b5ca79..e884bd321 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -211,6 +211,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -223,6 +224,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -249,6 +251,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -261,6 +264,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -287,6 +291,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -299,6 +304,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -325,6 +331,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -337,6 +344,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; -- 2.40.0