From: Linfeng Zhang Date: Fri, 28 Oct 2016 22:04:45 +0000 (-0700) Subject: Add high bitdepth intra prediction NEON optimization (mode tm) X-Git-Tag: v1.6.1~93^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=85c1ee434d51994844125a18bf6b98fd0510b827;p=libvpx Add high bitdepth intra prediction NEON optimization (mode tm) BUG=webm:1316 Change-Id: Ib014de06836ac12726f4a2c9f0833ec4eb4d233b --- diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index a1adef297..91a748182 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -481,27 +481,35 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL) -HIGHBD_INTRA_PRED_TEST( - NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL) -HIGHBD_INTRA_PRED_TEST( - NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL) + vpx_highbd_d135_predictor_8x8_neon, NULL, NULL, NULL, NULL, + vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, + vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, + vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, + vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, + vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, + vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, + vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, NULL, NULL, NULL, + NULL, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 74107be08..ada6052b5 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -472,6 +472,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_h_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, &vpx_highbd_h_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, &vpx_highbd_v_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, @@ -540,6 +548,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_h_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, &vpx_highbd_h_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, &vpx_highbd_v_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, @@ -608,6 +624,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_h_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_neon, &vpx_highbd_h_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_neon, + &vpx_highbd_tm_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_neon, + &vpx_highbd_tm_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_16x16_neon, + &vpx_highbd_tm_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_neon, + &vpx_highbd_tm_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_neon, &vpx_highbd_v_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_neon, diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index ea959586e..6f7e5da76 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -895,3 +895,184 @@ void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, h_store_32(&dst, stride, row); } } + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 10e632a84..0dbfe4e60 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -233,7 +233,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_4x4 sse2/; + specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; @@ -274,7 +274,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_8x8 sse2/; + specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; @@ -315,7 +315,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_16x16 sse2/; + specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; @@ -356,7 +356,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_32x32 sse2/; + specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;