From 6a82f0d7fb9ee908c389e8d55444bbaed3d54e9c Mon Sep 17 00:00:00 2001
From: Johann
Date: Fri, 5 Jun 2015 09:54:19 -0700
Subject: [PATCH] Move sub pixel variance to vpx_dsp

Change-Id: I66bf6720c396c89aa2d1fd26d5d52bf5d5e3dff1
---
 test/variance_test.cc                          | 1614 +++++++-------
 .../arm/neon/vp8_subpixelvariance_neon.c       | 1017 ---------
 vp8/common/arm/variance_arm.c                  |  137 --
 vp8/common/mfqe.c                              |    2 +-
 vp8/common/rtcd_defs.pl                        |   41 -
 vp8/common/variance.h                          |   92 -
 vp8/common/variance_c.c                        |  337 ---
 vp8/common/x86/variance_impl_sse2.asm          |  972 ---------
 vp8/common/x86/variance_impl_ssse3.asm         |  364 ----
 vp8/common/x86/variance_ssse3.c                |  157 --
 vp8/common/x86/vp8_variance_impl_mmx.asm       |  353 ---
 vp8/common/x86/vp8_variance_mmx.c              |  244 ---
 vp8/common/x86/vp8_variance_sse2.c             |  403 ----
 vp8/encoder/firstpass.c                        |    2 +-
 vp8/encoder/mcomp.h                            |    2 +-
 vp8/encoder/onyx_if.c                          |   16 +-
 vp8/encoder/onyx_int.h                         |    2 +-
 vp8/encoder/pickinter.c                        |    2 +-
 vp8/encoder/rdopt.c                            |    6 +-
 vp8/vp8_common.mk                              |   13 -
 vp9/common/mips/msa/vp9_convolve_avg_msa.c     |    2 +-
 vp9/common/mips/msa/vp9_convolve_copy_msa.c    |    2 +-
 vp9/common/mips/msa/vp9_convolve_msa.h         |    2 +-
 vp9/common/mips/msa/vp9_idct_msa.h             |    2 +-
 vp9/common/mips/msa/vp9_intra_predict_msa.c    |    2 +-
 vp9/common/mips/msa/vp9_loopfilter_msa.h       |    2 +-
 vp9/common/mips/msa/vp9_macros_msa.h           | 1885 -----------------
 vp9/common/mips/msa/vp9_mfqe_msa.c             |    2 +-
 vp9/common/vp9_rtcd_defs.pl                    |  317 ---
 vp9/encoder/mips/msa/vp9_avg_msa.c             |    2 +-
 vp9/encoder/mips/msa/vp9_error_msa.c           |    2 +-
 vp9/encoder/mips/msa/vp9_fdct_msa.h            |    2 +-
 .../mips/msa/vp9_temporal_filter_msa.c         |    2 +-
 vp9/encoder/vp9_encoder.c                      |  208 +-
 vp9/encoder/vp9_encoder.h                      |    2 +-
 vp9/encoder/vp9_firstpass.c                    |   10 +-
 vp9/encoder/vp9_mcomp.h                        |    2 +-
 vp9/encoder/vp9_rd.c                           |    1 -
 vp9/encoder/vp9_rdopt.c                        |    1 -
 vp9/encoder/vp9_variance.c                     |  380 ----
 vp9/encoder/vp9_variance.h                     |   81 -
 vp9/encoder/x86/vp9_highbd_variance_sse2.c     |  349 ---
 .../vp9_subpel_variance_impl_intrin_avx2.c     |  525 -----
 vp9/encoder/x86/vp9_variance_avx2.c            |  104 -
 vp9/encoder/x86/vp9_variance_sse2.c            |  182 --
 vp9/vp9_common.mk                              |    1 -
 vp9/vp9cx.mk                                   |   12 -
 vpx_dsp/arm/bilinear_filter_media.asm          |  237 +++
 vpx_dsp/arm/subpel_variance_media.c            |  105 +
 .../arm/subpel_variance_neon.c                 |   23 +-
 .../arm/variance_halfpixvar16x16_h_media.asm   |    4 +-
 .../arm/variance_halfpixvar16x16_hv_media.asm  |    4 +-
 .../arm/variance_halfpixvar16x16_v_media.asm   |    4 +-
 vpx_dsp/mips/macros_msa.h                      | 1517 ++++++++++++-
 .../mips/sub_pixel_variance_msa.c              |   41 +-
 vpx_dsp/variance.c                             |  495 ++++-
 vpx_dsp/variance.h                             |   94 +
 vpx_dsp/vpx_dsp.mk                             |   20 +-
 vpx_dsp/vpx_dsp_rtcd_defs.pl                   |  320 ++-
 .../x86/highbd_subpel_variance_impl_sse2.asm   |    4 +-
 vpx_dsp/x86/highbd_variance_sse2.c             |  340 ++-
 .../x86/subpel_variance_sse2.asm               |    4 +-
 vpx_dsp/x86/variance_avx2.c                    |   90 +
 vpx_dsp/x86/variance_impl_avx2.c               |  512 +++++
 vpx_dsp/x86/variance_impl_mmx.asm              |  342 ++-
 vpx_dsp/x86/variance_mmx.c                     |  166 +-
 vpx_dsp/x86/variance_sse2.c                    |  168 ++
 67 files changed, 5171 insertions(+), 9177 deletions(-)
 delete mode 100644 vp8/common/arm/neon/vp8_subpixelvariance_neon.c
 delete mode 100644 vp8/common/arm/variance_arm.c
 delete mode 100644 vp8/common/variance.h
 delete mode 100644 vp8/common/variance_c.c
 delete mode 100644 vp8/common/x86/variance_impl_sse2.asm
 delete mode 100644 vp8/common/x86/variance_impl_ssse3.asm
 delete mode 100644 vp8/common/x86/variance_ssse3.c
 delete mode 100644 vp8/common/x86/vp8_variance_impl_mmx.asm
 delete mode 100644 vp8/common/x86/vp8_variance_mmx.c
 delete mode 100644 vp8/common/x86/vp8_variance_sse2.c
 delete mode 100644
vp9/common/mips/msa/vp9_macros_msa.h delete mode 100644 vp9/encoder/vp9_variance.c delete mode 100644 vp9/encoder/vp9_variance.h delete mode 100644 vp9/encoder/x86/vp9_highbd_variance_sse2.c delete mode 100644 vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c delete mode 100644 vp9/encoder/x86/vp9_variance_avx2.c delete mode 100644 vp9/encoder/x86/vp9_variance_sse2.c create mode 100644 vpx_dsp/arm/bilinear_filter_media.asm create mode 100644 vpx_dsp/arm/subpel_variance_media.c rename vp9/encoder/arm/neon/vp9_variance_neon.c => vpx_dsp/arm/subpel_variance_neon.c (90%) rename vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm => vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm (98%) rename vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm => vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm (98%) rename vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm => vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm (98%) rename vp9/encoder/mips/msa/vp9_variance_msa.c => vpx_dsp/mips/sub_pixel_variance_msa.c (96%) create mode 100644 vpx_dsp/variance.h rename vp9/encoder/x86/vp9_highbd_subpel_variance.asm => vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm (99%) rename vp9/encoder/x86/vp9_subpel_variance.asm => vpx_dsp/x86/subpel_variance_sse2.asm (99%) diff --git a/test/variance_test.cc b/test/variance_test.cc index c9dbcd469..64095bc03 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -21,13 +21,6 @@ #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#if CONFIG_VP8_ENCODER -# include "./vp8_rtcd.h" -#endif // CONFIG_VP8_ENCODER -#if CONFIG_VP9_ENCODER -# include "./vp9_rtcd.h" -# include "vp9/encoder/vp9_variance.h" -#endif // CONFIG_VP9_ENCODER #include "./vpx_dsp_rtcd.h" namespace { @@ -39,8 +32,15 @@ typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse); +typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse, + const uint8_t *second_pred); typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); +typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); + using ::std::tr1::get; using ::std::tr1::make_tuple; @@ -166,8 +166,6 @@ static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src, (l2w + l2h))); } -typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); - class SumOfSquaresTest : public ::testing::TestWithParam { public: SumOfSquaresTest() : func_(GetParam()) {} @@ -687,9 +685,8 @@ void SubpelVarianceTest::ExtremeRefTest() { } } -#if CONFIG_VP9_ENCODER template<> -void SubpelVarianceTest::RefTest() { +void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { if (!use_high_bit_depth_) { @@ -726,11 +723,12 @@ void SubpelVarianceTest::RefTest() { } } } -#endif // CONFIG_VP9_ENCODER typedef MseTest VpxSseTest; typedef MseTest VpxMseTest; typedef VarianceTest VpxVarianceTest; +typedef SubpelVarianceTest VpxSubpelVarianceTest; +typedef SubpelVarianceTest VpxSubpelAvgVarianceTest; TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); } TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); } @@ -742,6 +740,9 @@ TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(SumOfSquaresTest, Const) { ConstTest(); } TEST_P(SumOfSquaresTest, Ref) { RefTest(); } 
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); } INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest, ::testing::Values(vpx_get_mb_ss_c)); @@ -773,7 +774,6 @@ const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c; const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c; const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c; const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c; - INSTANTIATE_TEST_CASE_P( C, VpxVarianceTest, ::testing::Values(make_tuple(6, 6, variance64x64_c, 0), @@ -790,9 +790,79 @@ INSTANTIATE_TEST_CASE_P( make_tuple(2, 3, variance4x8_c, 0), make_tuple(2, 2, variance4x4_c, 0))); +const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c; +const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c; +const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c; +const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c; +const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c; +const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c; +const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c; +const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c; +const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c; +const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c; +const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c; +const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c; +const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c; +INSTANTIATE_TEST_CASE_P( + C, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0), + make_tuple(6, 5, subpel_var64x32_c, 0), + make_tuple(5, 6, subpel_var32x64_c, 0), + make_tuple(5, 5, subpel_var32x32_c, 0), + make_tuple(5, 4, subpel_var32x16_c, 0), + make_tuple(4, 5, subpel_var16x32_c, 0), + make_tuple(4, 4, subpel_var16x16_c, 0), + make_tuple(4, 3, subpel_var16x8_c, 0), + make_tuple(3, 4, subpel_var8x16_c, 0), + make_tuple(3, 3, subpel_var8x8_c, 0), + make_tuple(3, 2, subpel_var8x4_c, 0), + make_tuple(2, 3, subpel_var4x8_c, 0), + make_tuple(2, 2, subpel_var4x4_c, 0))); + +const SubpixAvgVarMxNFunc subpel_avg_var64x64_c = + vpx_sub_pixel_avg_variance64x64_c; +const SubpixAvgVarMxNFunc subpel_avg_var64x32_c = + vpx_sub_pixel_avg_variance64x32_c; +const SubpixAvgVarMxNFunc subpel_avg_var32x64_c = + vpx_sub_pixel_avg_variance32x64_c; +const SubpixAvgVarMxNFunc subpel_avg_var32x32_c = + vpx_sub_pixel_avg_variance32x32_c; +const SubpixAvgVarMxNFunc subpel_avg_var32x16_c = + vpx_sub_pixel_avg_variance32x16_c; +const SubpixAvgVarMxNFunc subpel_avg_var16x32_c = + vpx_sub_pixel_avg_variance16x32_c; +const SubpixAvgVarMxNFunc subpel_avg_var16x16_c = + vpx_sub_pixel_avg_variance16x16_c; +const SubpixAvgVarMxNFunc subpel_avg_var16x8_c = + vpx_sub_pixel_avg_variance16x8_c; +const SubpixAvgVarMxNFunc subpel_avg_var8x16_c = + vpx_sub_pixel_avg_variance8x16_c; +const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c; +const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c; +const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c; +const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c; +INSTANTIATE_TEST_CASE_P( + C, VpxSubpelAvgVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0), + make_tuple(6, 5, 
subpel_avg_var64x32_c, 0), + make_tuple(5, 6, subpel_avg_var32x64_c, 0), + make_tuple(5, 5, subpel_avg_var32x32_c, 0), + make_tuple(5, 4, subpel_avg_var32x16_c, 0), + make_tuple(4, 5, subpel_avg_var16x32_c, 0), + make_tuple(4, 4, subpel_avg_var16x16_c, 0), + make_tuple(4, 3, subpel_avg_var16x8_c, 0), + make_tuple(3, 4, subpel_avg_var8x16_c, 0), + make_tuple(3, 3, subpel_avg_var8x8_c, 0), + make_tuple(3, 2, subpel_avg_var8x4_c, 0), + make_tuple(2, 3, subpel_avg_var4x8_c, 0), + make_tuple(2, 2, subpel_avg_var4x4_c, 0))); + #if CONFIG_VP9_HIGHBITDEPTH typedef MseTest VpxHBDMseTest; typedef VarianceTest VpxHBDVarianceTest; +typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; +typedef SubpelVarianceTest + VpxHBDSubpelAvgVarianceTest; TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); } TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); } @@ -800,6 +870,9 @@ TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); } TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); } TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } +TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); } +TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c; @@ -844,7 +917,6 @@ const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c; const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c; const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c; const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c; - const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c; const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c; const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c; @@ -858,7 +930,6 @@ const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c; const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c; const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c; const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c; - const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c; const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c; const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c; @@ -913,6 +984,247 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 2, highbd_8_variance8x4_c, 8), make_tuple(2, 3, highbd_8_variance4x8_c, 8), make_tuple(2, 2, highbd_8_variance4x4_c, 8))); + +const SubpixVarMxNFunc highbd_8_subpel_var64x64_c = + vpx_highbd_8_sub_pixel_variance64x64_c; +const SubpixVarMxNFunc highbd_8_subpel_var64x32_c = + vpx_highbd_8_sub_pixel_variance64x32_c; +const SubpixVarMxNFunc highbd_8_subpel_var32x64_c = + vpx_highbd_8_sub_pixel_variance32x64_c; +const SubpixVarMxNFunc highbd_8_subpel_var32x32_c = + vpx_highbd_8_sub_pixel_variance32x32_c; +const SubpixVarMxNFunc highbd_8_subpel_var32x16_c = + vpx_highbd_8_sub_pixel_variance32x16_c; +const SubpixVarMxNFunc highbd_8_subpel_var16x32_c = + vpx_highbd_8_sub_pixel_variance16x32_c; +const SubpixVarMxNFunc highbd_8_subpel_var16x16_c = + vpx_highbd_8_sub_pixel_variance16x16_c; +const SubpixVarMxNFunc highbd_8_subpel_var16x8_c = + vpx_highbd_8_sub_pixel_variance16x8_c; +const SubpixVarMxNFunc highbd_8_subpel_var8x16_c = + 
vpx_highbd_8_sub_pixel_variance8x16_c; +const SubpixVarMxNFunc highbd_8_subpel_var8x8_c = + vpx_highbd_8_sub_pixel_variance8x8_c; +const SubpixVarMxNFunc highbd_8_subpel_var8x4_c = + vpx_highbd_8_sub_pixel_variance8x4_c; +const SubpixVarMxNFunc highbd_8_subpel_var4x8_c = + vpx_highbd_8_sub_pixel_variance4x8_c; +const SubpixVarMxNFunc highbd_8_subpel_var4x4_c = + vpx_highbd_8_sub_pixel_variance4x4_c; +const SubpixVarMxNFunc highbd_10_subpel_var64x64_c = + vpx_highbd_10_sub_pixel_variance64x64_c; +const SubpixVarMxNFunc highbd_10_subpel_var64x32_c = + vpx_highbd_10_sub_pixel_variance64x32_c; +const SubpixVarMxNFunc highbd_10_subpel_var32x64_c = + vpx_highbd_10_sub_pixel_variance32x64_c; +const SubpixVarMxNFunc highbd_10_subpel_var32x32_c = + vpx_highbd_10_sub_pixel_variance32x32_c; +const SubpixVarMxNFunc highbd_10_subpel_var32x16_c = + vpx_highbd_10_sub_pixel_variance32x16_c; +const SubpixVarMxNFunc highbd_10_subpel_var16x32_c = + vpx_highbd_10_sub_pixel_variance16x32_c; +const SubpixVarMxNFunc highbd_10_subpel_var16x16_c = + vpx_highbd_10_sub_pixel_variance16x16_c; +const SubpixVarMxNFunc highbd_10_subpel_var16x8_c = + vpx_highbd_10_sub_pixel_variance16x8_c; +const SubpixVarMxNFunc highbd_10_subpel_var8x16_c = + vpx_highbd_10_sub_pixel_variance8x16_c; +const SubpixVarMxNFunc highbd_10_subpel_var8x8_c = + vpx_highbd_10_sub_pixel_variance8x8_c; +const SubpixVarMxNFunc highbd_10_subpel_var8x4_c = + vpx_highbd_10_sub_pixel_variance8x4_c; +const SubpixVarMxNFunc highbd_10_subpel_var4x8_c = + vpx_highbd_10_sub_pixel_variance4x8_c; +const SubpixVarMxNFunc highbd_10_subpel_var4x4_c = + vpx_highbd_10_sub_pixel_variance4x4_c; +const SubpixVarMxNFunc highbd_12_subpel_var64x64_c = + vpx_highbd_12_sub_pixel_variance64x64_c; +const SubpixVarMxNFunc highbd_12_subpel_var64x32_c = + vpx_highbd_12_sub_pixel_variance64x32_c; +const SubpixVarMxNFunc highbd_12_subpel_var32x64_c = + vpx_highbd_12_sub_pixel_variance32x64_c; +const SubpixVarMxNFunc highbd_12_subpel_var32x32_c = + vpx_highbd_12_sub_pixel_variance32x32_c; +const SubpixVarMxNFunc highbd_12_subpel_var32x16_c = + vpx_highbd_12_sub_pixel_variance32x16_c; +const SubpixVarMxNFunc highbd_12_subpel_var16x32_c = + vpx_highbd_12_sub_pixel_variance16x32_c; +const SubpixVarMxNFunc highbd_12_subpel_var16x16_c = + vpx_highbd_12_sub_pixel_variance16x16_c; +const SubpixVarMxNFunc highbd_12_subpel_var16x8_c = + vpx_highbd_12_sub_pixel_variance16x8_c; +const SubpixVarMxNFunc highbd_12_subpel_var8x16_c = + vpx_highbd_12_sub_pixel_variance8x16_c; +const SubpixVarMxNFunc highbd_12_subpel_var8x8_c = + vpx_highbd_12_sub_pixel_variance8x8_c; +const SubpixVarMxNFunc highbd_12_subpel_var8x4_c = + vpx_highbd_12_sub_pixel_variance8x4_c; +const SubpixVarMxNFunc highbd_12_subpel_var4x8_c = + vpx_highbd_12_sub_pixel_variance4x8_c; +const SubpixVarMxNFunc highbd_12_subpel_var4x4_c = + vpx_highbd_12_sub_pixel_variance4x4_c; +INSTANTIATE_TEST_CASE_P( + C, VpxHBDSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8), + make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8), + make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8), + make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8), + make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8), + make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8), + make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8), + make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8), + make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8), + make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8), + make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8), + make_tuple(2, 3, 
highbd_8_subpel_var4x8_c, 8), + make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8), + make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10), + make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10), + make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10), + make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10), + make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10), + make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10), + make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10), + make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10), + make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10), + make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10), + make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10), + make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10), + make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10), + make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12), + make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12), + make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12), + make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12), + make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12), + make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12), + make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12), + make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12), + make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12), + make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12), + make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12), + make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12), + make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12))); + +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c = + vpx_highbd_8_sub_pixel_avg_variance64x64_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c = + vpx_highbd_8_sub_pixel_avg_variance64x32_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c = + vpx_highbd_8_sub_pixel_avg_variance32x64_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c = + vpx_highbd_8_sub_pixel_avg_variance32x32_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c = + vpx_highbd_8_sub_pixel_avg_variance32x16_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c = + vpx_highbd_8_sub_pixel_avg_variance16x32_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c = + vpx_highbd_8_sub_pixel_avg_variance16x16_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c = + vpx_highbd_8_sub_pixel_avg_variance16x8_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c = + vpx_highbd_8_sub_pixel_avg_variance8x16_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c = + vpx_highbd_8_sub_pixel_avg_variance8x8_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c = + vpx_highbd_8_sub_pixel_avg_variance8x4_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c = + vpx_highbd_8_sub_pixel_avg_variance4x8_c; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c = + vpx_highbd_8_sub_pixel_avg_variance4x4_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c = + vpx_highbd_10_sub_pixel_avg_variance64x64_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c = + vpx_highbd_10_sub_pixel_avg_variance64x32_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c = + vpx_highbd_10_sub_pixel_avg_variance32x64_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c = + vpx_highbd_10_sub_pixel_avg_variance32x32_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c = + vpx_highbd_10_sub_pixel_avg_variance32x16_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c = + vpx_highbd_10_sub_pixel_avg_variance16x32_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c = + vpx_highbd_10_sub_pixel_avg_variance16x16_c; 
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c = + vpx_highbd_10_sub_pixel_avg_variance16x8_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c = + vpx_highbd_10_sub_pixel_avg_variance8x16_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c = + vpx_highbd_10_sub_pixel_avg_variance8x8_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c = + vpx_highbd_10_sub_pixel_avg_variance8x4_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c = + vpx_highbd_10_sub_pixel_avg_variance4x8_c; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c = + vpx_highbd_10_sub_pixel_avg_variance4x4_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c = + vpx_highbd_12_sub_pixel_avg_variance64x64_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c = + vpx_highbd_12_sub_pixel_avg_variance64x32_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c = + vpx_highbd_12_sub_pixel_avg_variance32x64_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c = + vpx_highbd_12_sub_pixel_avg_variance32x32_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c = + vpx_highbd_12_sub_pixel_avg_variance32x16_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c = + vpx_highbd_12_sub_pixel_avg_variance16x32_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c = + vpx_highbd_12_sub_pixel_avg_variance16x16_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c = + vpx_highbd_12_sub_pixel_avg_variance16x8_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c = + vpx_highbd_12_sub_pixel_avg_variance8x16_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c = + vpx_highbd_12_sub_pixel_avg_variance8x8_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c = + vpx_highbd_12_sub_pixel_avg_variance8x4_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c = + vpx_highbd_12_sub_pixel_avg_variance4x8_c; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c = + vpx_highbd_12_sub_pixel_avg_variance4x4_c; +INSTANTIATE_TEST_CASE_P( + C, VpxHBDSubpelAvgVarianceTest, + ::testing::Values( + make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8), + make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8), + make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8), + make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8), + make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8), + make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8), + make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8), + make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8), + make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8), + make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8), + make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8), + make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8), + make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8), + make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10), + make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10), + make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10), + make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10), + make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10), + make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10), + make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10), + make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10), + make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10), + make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10), + make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10), + make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10), + make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10), + make_tuple(6, 6, 
highbd_12_subpel_avg_var64x64_c, 12), + make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12), + make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12), + make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12), + make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12), + make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12), + make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12), + make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12), + make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12), + make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12), + make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12), + make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12), + make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_MMX @@ -935,6 +1247,19 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 4, variance8x16_mmx, 0), make_tuple(3, 3, variance8x8_mmx, 0), make_tuple(2, 2, variance4x4_mmx, 0))); + +const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx; +const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx; +const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx; +const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx; +const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx; +INSTANTIATE_TEST_CASE_P( + MMX, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0), + make_tuple(4, 3, subpel_var16x8_mmx, 0), + make_tuple(3, 4, subpel_var8x16_mmx, 0), + make_tuple(3, 3, subpel_var8x8_mmx, 0), + make_tuple(2, 2, subpel_var4x4_mmx, 0))); #endif // HAVE_MMX #if HAVE_SSE2 @@ -979,6 +1304,90 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 2, variance8x4_sse2, 0), make_tuple(2, 3, variance4x8_sse2, 0), make_tuple(2, 2, variance4x4_sse2, 0))); + +#if CONFIG_USE_X86INC +const SubpixVarMxNFunc subpel_variance64x64_sse2 = + vpx_sub_pixel_variance64x64_sse2; +const SubpixVarMxNFunc subpel_variance64x32_sse2 = + vpx_sub_pixel_variance64x32_sse2; +const SubpixVarMxNFunc subpel_variance32x64_sse2 = + vpx_sub_pixel_variance32x64_sse2; +const SubpixVarMxNFunc subpel_variance32x32_sse2 = + vpx_sub_pixel_variance32x32_sse2; +const SubpixVarMxNFunc subpel_variance32x16_sse2 = + vpx_sub_pixel_variance32x16_sse2; +const SubpixVarMxNFunc subpel_variance16x32_sse2 = + vpx_sub_pixel_variance16x32_sse2; +const SubpixVarMxNFunc subpel_variance16x16_sse2 = + vpx_sub_pixel_variance16x16_sse2; +const SubpixVarMxNFunc subpel_variance16x8_sse2 = + vpx_sub_pixel_variance16x8_sse2; +const SubpixVarMxNFunc subpel_variance8x16_sse2 = + vpx_sub_pixel_variance8x16_sse2; +const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2; +const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2; +const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse; +const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse; +INSTANTIATE_TEST_CASE_P( + SSE2, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0), + make_tuple(6, 5, subpel_variance64x32_sse2, 0), + make_tuple(5, 6, subpel_variance32x64_sse2, 0), + make_tuple(5, 5, subpel_variance32x32_sse2, 0), + make_tuple(5, 4, subpel_variance32x16_sse2, 0), + make_tuple(4, 5, subpel_variance16x32_sse2, 0), + make_tuple(4, 4, subpel_variance16x16_sse2, 0), + make_tuple(4, 3, subpel_variance16x8_sse2, 0), + make_tuple(3, 4, subpel_variance8x16_sse2, 0), + make_tuple(3, 3, subpel_variance8x8_sse2, 0), + make_tuple(3, 2, subpel_variance8x4_sse2, 0), 
+ make_tuple(2, 3, subpel_variance4x8_sse, 0), + make_tuple(2, 2, subpel_variance4x4_sse, 0))); + +const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 = + vpx_sub_pixel_avg_variance64x64_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 = + vpx_sub_pixel_avg_variance64x32_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 = + vpx_sub_pixel_avg_variance32x64_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 = + vpx_sub_pixel_avg_variance32x32_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 = + vpx_sub_pixel_avg_variance32x16_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 = + vpx_sub_pixel_avg_variance16x32_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 = + vpx_sub_pixel_avg_variance16x16_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 = + vpx_sub_pixel_avg_variance16x8_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 = + vpx_sub_pixel_avg_variance8x16_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 = + vpx_sub_pixel_avg_variance8x8_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 = + vpx_sub_pixel_avg_variance8x4_sse2; +const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse = + vpx_sub_pixel_avg_variance4x8_sse; +const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse = + vpx_sub_pixel_avg_variance4x4_sse; +INSTANTIATE_TEST_CASE_P( + SSE2, VpxSubpelAvgVarianceTest, + ::testing::Values( + make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0), + make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0), + make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0), + make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0), + make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0), + make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0), + make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0), + make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0), + make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0), + make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0), + make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0), + make_tuple(2, 3, subpel_avg_variance4x8_sse, 0), + make_tuple(2, 2, subpel_avg_variance4x4_sse, 0))); +#endif // CONFIG_USE_X86INC + #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2; @@ -1103,794 +1512,303 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 3, highbd_8_variance16x8_sse2, 8), make_tuple(3, 4, highbd_8_variance8x16_sse2, 8), make_tuple(3, 3, highbd_8_variance8x8_sse2, 8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_SSE2 - -#if CONFIG_VP8_ENCODER -typedef SubpelVarianceTest VP8SubpelVarianceTest; - -TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); } -TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } -#endif // CONFIG_VP8_ENCODER - -#if CONFIG_VP9_ENCODER -typedef SubpelVarianceTest VP9SubpelVarianceTest; -typedef SubpelVarianceTest VP9SubpelAvgVarianceTest; - -TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); } -TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } -TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); } - -#if CONFIG_VP9_HIGHBITDEPTH -typedef SubpelVarianceTest VP9SubpelVarianceHighTest; -typedef SubpelVarianceTest - VP9SubpelAvgVarianceHighTest; - -TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); } -TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); } -TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); } -#endif // CONFIG_VP9_HIGHBITDEPTH - -const SubpixVarMxNFunc subpel_variance4x4_c = 
vp9_sub_pixel_variance4x4_c; -const SubpixVarMxNFunc subpel_variance4x8_c = vp9_sub_pixel_variance4x8_c; -const SubpixVarMxNFunc subpel_variance8x4_c = vp9_sub_pixel_variance8x4_c; -const SubpixVarMxNFunc subpel_variance8x8_c = vp9_sub_pixel_variance8x8_c; -const SubpixVarMxNFunc subpel_variance8x16_c = vp9_sub_pixel_variance8x16_c; -const SubpixVarMxNFunc subpel_variance16x8_c = vp9_sub_pixel_variance16x8_c; -const SubpixVarMxNFunc subpel_variance16x16_c = vp9_sub_pixel_variance16x16_c; -const SubpixVarMxNFunc subpel_variance16x32_c = vp9_sub_pixel_variance16x32_c; -const SubpixVarMxNFunc subpel_variance32x16_c = vp9_sub_pixel_variance32x16_c; -const SubpixVarMxNFunc subpel_variance32x32_c = vp9_sub_pixel_variance32x32_c; -const SubpixVarMxNFunc subpel_variance32x64_c = vp9_sub_pixel_variance32x64_c; -const SubpixVarMxNFunc subpel_variance64x32_c = vp9_sub_pixel_variance64x32_c; -const SubpixVarMxNFunc subpel_variance64x64_c = vp9_sub_pixel_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0), - make_tuple(2, 3, subpel_variance4x8_c, 0), - make_tuple(3, 2, subpel_variance8x4_c, 0), - make_tuple(3, 3, subpel_variance8x8_c, 0), - make_tuple(3, 4, subpel_variance8x16_c, 0), - make_tuple(4, 3, subpel_variance16x8_c, 0), - make_tuple(4, 4, subpel_variance16x16_c, 0), - make_tuple(4, 5, subpel_variance16x32_c, 0), - make_tuple(5, 4, subpel_variance32x16_c, 0), - make_tuple(5, 5, subpel_variance32x32_c, 0), - make_tuple(5, 6, subpel_variance32x64_c, 0), - make_tuple(6, 5, subpel_variance64x32_c, 0), - make_tuple(6, 6, subpel_variance64x64_c, 0))); - -#if CONFIG_VP8_ENCODER -const SubpixVarMxNFunc vp8_subpel_variance16x16_c = - vp8_sub_pixel_variance16x16_c; -const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c; -const SubpixVarMxNFunc vp8_subpel_variance8x16_c = vp8_sub_pixel_variance8x16_c; -const SubpixVarMxNFunc vp8_subpel_variance8x8_c = vp8_sub_pixel_variance8x8_c; -const SubpixVarMxNFunc vp8_subpel_variance4x4_c = vp8_sub_pixel_variance4x4_c; -INSTANTIATE_TEST_CASE_P( - C, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_c, 0), - make_tuple(3, 3, vp8_subpel_variance8x8_c, 0), - make_tuple(3, 4, vp8_subpel_variance8x16_c, 0), - make_tuple(4, 3, vp8_subpel_variance16x8_c, 0), - make_tuple(4, 4, vp8_subpel_variance16x16_c, 0))); -#endif // CONFIG_VP8_ENCODER - -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c = - vp9_sub_pixel_avg_variance4x4_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c = - vp9_sub_pixel_avg_variance4x8_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c = - vp9_sub_pixel_avg_variance8x4_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c = - vp9_sub_pixel_avg_variance8x8_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c = - vp9_sub_pixel_avg_variance8x16_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c = - vp9_sub_pixel_avg_variance16x8_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c = - vp9_sub_pixel_avg_variance16x16_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c = - vp9_sub_pixel_avg_variance16x32_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c = - vp9_sub_pixel_avg_variance32x16_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c = - vp9_sub_pixel_avg_variance32x32_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c = - vp9_sub_pixel_avg_variance32x64_c; -const vp9_subp_avg_variance_fn_t 
subpel_avg_variance64x32_c = - vp9_sub_pixel_avg_variance64x32_c; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c = - vp9_sub_pixel_avg_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0), - make_tuple(2, 3, subpel_avg_variance4x8_c, 0), - make_tuple(3, 2, subpel_avg_variance8x4_c, 0), - make_tuple(3, 3, subpel_avg_variance8x8_c, 0), - make_tuple(3, 4, subpel_avg_variance8x16_c, 0), - make_tuple(4, 3, subpel_avg_variance16x8_c, 0), - make_tuple(4, 4, subpel_avg_variance16x16_c, 0), - make_tuple(4, 5, subpel_avg_variance16x32_c, 0), - make_tuple(5, 4, subpel_avg_variance32x16_c, 0), - make_tuple(5, 5, subpel_avg_variance32x32_c, 0), - make_tuple(5, 6, subpel_avg_variance32x64_c, 0), - make_tuple(6, 5, subpel_avg_variance64x32_c, 0), - make_tuple(6, 6, subpel_avg_variance64x64_c, 0))); -#if CONFIG_VP9_HIGHBITDEPTH -const SubpixVarMxNFunc highbd_10_subpel_variance4x4_c = - vp9_highbd_10_sub_pixel_variance4x4_c; -const SubpixVarMxNFunc highbd_10_subpel_variance4x8_c = - vp9_highbd_10_sub_pixel_variance4x8_c; -const SubpixVarMxNFunc highbd_10_subpel_variance8x4_c = - vp9_highbd_10_sub_pixel_variance8x4_c; -const SubpixVarMxNFunc highbd_10_subpel_variance8x8_c = - vp9_highbd_10_sub_pixel_variance8x8_c; -const SubpixVarMxNFunc highbd_10_subpel_variance8x16_c = - vp9_highbd_10_sub_pixel_variance8x16_c; -const SubpixVarMxNFunc highbd_10_subpel_variance16x8_c = - vp9_highbd_10_sub_pixel_variance16x8_c; -const SubpixVarMxNFunc highbd_10_subpel_variance16x16_c = - vp9_highbd_10_sub_pixel_variance16x16_c; -const SubpixVarMxNFunc highbd_10_subpel_variance16x32_c = - vp9_highbd_10_sub_pixel_variance16x32_c; -const SubpixVarMxNFunc highbd_10_subpel_variance32x16_c = - vp9_highbd_10_sub_pixel_variance32x16_c; -const SubpixVarMxNFunc highbd_10_subpel_variance32x32_c = - vp9_highbd_10_sub_pixel_variance32x32_c; -const SubpixVarMxNFunc highbd_10_subpel_variance32x64_c = - vp9_highbd_10_sub_pixel_variance32x64_c; -const SubpixVarMxNFunc highbd_10_subpel_variance64x32_c = - vp9_highbd_10_sub_pixel_variance64x32_c; -const SubpixVarMxNFunc highbd_10_subpel_variance64x64_c = - vp9_highbd_10_sub_pixel_variance64x64_c; -const SubpixVarMxNFunc highbd_12_subpel_variance4x4_c = - vp9_highbd_12_sub_pixel_variance4x4_c; -const SubpixVarMxNFunc highbd_12_subpel_variance4x8_c = - vp9_highbd_12_sub_pixel_variance4x8_c; -const SubpixVarMxNFunc highbd_12_subpel_variance8x4_c = - vp9_highbd_12_sub_pixel_variance8x4_c; -const SubpixVarMxNFunc highbd_12_subpel_variance8x8_c = - vp9_highbd_12_sub_pixel_variance8x8_c; -const SubpixVarMxNFunc highbd_12_subpel_variance8x16_c = - vp9_highbd_12_sub_pixel_variance8x16_c; -const SubpixVarMxNFunc highbd_12_subpel_variance16x8_c = - vp9_highbd_12_sub_pixel_variance16x8_c; -const SubpixVarMxNFunc highbd_12_subpel_variance16x16_c = - vp9_highbd_12_sub_pixel_variance16x16_c; -const SubpixVarMxNFunc highbd_12_subpel_variance16x32_c = - vp9_highbd_12_sub_pixel_variance16x32_c; -const SubpixVarMxNFunc highbd_12_subpel_variance32x16_c = - vp9_highbd_12_sub_pixel_variance32x16_c; -const SubpixVarMxNFunc highbd_12_subpel_variance32x32_c = - vp9_highbd_12_sub_pixel_variance32x32_c; -const SubpixVarMxNFunc highbd_12_subpel_variance32x64_c = - vp9_highbd_12_sub_pixel_variance32x64_c; -const SubpixVarMxNFunc highbd_12_subpel_variance64x32_c = - vp9_highbd_12_sub_pixel_variance64x32_c; -const SubpixVarMxNFunc highbd_12_subpel_variance64x64_c = - vp9_highbd_12_sub_pixel_variance64x64_c; -const 
SubpixVarMxNFunc highbd_subpel_variance4x4_c = - vp9_highbd_sub_pixel_variance4x4_c; -const SubpixVarMxNFunc highbd_subpel_variance4x8_c = - vp9_highbd_sub_pixel_variance4x8_c; -const SubpixVarMxNFunc highbd_subpel_variance8x4_c = - vp9_highbd_sub_pixel_variance8x4_c; -const SubpixVarMxNFunc highbd_subpel_variance8x8_c = - vp9_highbd_sub_pixel_variance8x8_c; -const SubpixVarMxNFunc highbd_subpel_variance8x16_c = - vp9_highbd_sub_pixel_variance8x16_c; -const SubpixVarMxNFunc highbd_subpel_variance16x8_c = - vp9_highbd_sub_pixel_variance16x8_c; -const SubpixVarMxNFunc highbd_subpel_variance16x16_c = - vp9_highbd_sub_pixel_variance16x16_c; -const SubpixVarMxNFunc highbd_subpel_variance16x32_c = - vp9_highbd_sub_pixel_variance16x32_c; -const SubpixVarMxNFunc highbd_subpel_variance32x16_c = - vp9_highbd_sub_pixel_variance32x16_c; -const SubpixVarMxNFunc highbd_subpel_variance32x32_c = - vp9_highbd_sub_pixel_variance32x32_c; -const SubpixVarMxNFunc highbd_subpel_variance32x64_c = - vp9_highbd_sub_pixel_variance32x64_c; -const SubpixVarMxNFunc highbd_subpel_variance64x32_c = - vp9_highbd_sub_pixel_variance64x32_c; -const SubpixVarMxNFunc highbd_subpel_variance64x64_c = - vp9_highbd_sub_pixel_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9SubpelVarianceHighTest, - ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10), - make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10), - make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10), - make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10), - make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10), - make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10), - make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10), - make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10), - make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10), - make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10), - make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10), - make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10), - make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10), - make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12), - make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12), - make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12), - make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12), - make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12), - make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12), - make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12), - make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12), - make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12), - make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12), - make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12), - make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12), - make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12), - make_tuple(2, 2, highbd_subpel_variance4x4_c, 8), - make_tuple(2, 3, highbd_subpel_variance4x8_c, 8), - make_tuple(3, 2, highbd_subpel_variance8x4_c, 8), - make_tuple(3, 3, highbd_subpel_variance8x8_c, 8), - make_tuple(3, 4, highbd_subpel_variance8x16_c, 8), - make_tuple(4, 3, highbd_subpel_variance16x8_c, 8), - make_tuple(4, 4, highbd_subpel_variance16x16_c, 8), - make_tuple(4, 5, highbd_subpel_variance16x32_c, 8), - make_tuple(5, 4, highbd_subpel_variance32x16_c, 8), - make_tuple(5, 5, highbd_subpel_variance32x32_c, 8), - make_tuple(5, 6, highbd_subpel_variance32x64_c, 8), - make_tuple(6, 5, highbd_subpel_variance64x32_c, 8), - make_tuple(6, 6, highbd_subpel_variance64x64_c, 8))); -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c = - 
vp9_highbd_10_sub_pixel_avg_variance4x4_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c = - vp9_highbd_10_sub_pixel_avg_variance4x8_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c = - vp9_highbd_10_sub_pixel_avg_variance8x4_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c = - vp9_highbd_10_sub_pixel_avg_variance8x8_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c = - vp9_highbd_10_sub_pixel_avg_variance8x16_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c = - vp9_highbd_10_sub_pixel_avg_variance16x8_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c = - vp9_highbd_10_sub_pixel_avg_variance16x16_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c = - vp9_highbd_10_sub_pixel_avg_variance16x32_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c = - vp9_highbd_10_sub_pixel_avg_variance32x16_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c = - vp9_highbd_10_sub_pixel_avg_variance32x32_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c = - vp9_highbd_10_sub_pixel_avg_variance32x64_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c = - vp9_highbd_10_sub_pixel_avg_variance64x32_c; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c = - vp9_highbd_10_sub_pixel_avg_variance64x64_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c = - vp9_highbd_12_sub_pixel_avg_variance4x4_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c = - vp9_highbd_12_sub_pixel_avg_variance4x8_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c = - vp9_highbd_12_sub_pixel_avg_variance8x4_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c = - vp9_highbd_12_sub_pixel_avg_variance8x8_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c = - vp9_highbd_12_sub_pixel_avg_variance8x16_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c = - vp9_highbd_12_sub_pixel_avg_variance16x8_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c = - vp9_highbd_12_sub_pixel_avg_variance16x16_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c = - vp9_highbd_12_sub_pixel_avg_variance16x32_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c = - vp9_highbd_12_sub_pixel_avg_variance32x16_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c = - vp9_highbd_12_sub_pixel_avg_variance32x32_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c = - vp9_highbd_12_sub_pixel_avg_variance32x64_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c = - vp9_highbd_12_sub_pixel_avg_variance64x32_c; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c = - vp9_highbd_12_sub_pixel_avg_variance64x64_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c = - vp9_highbd_sub_pixel_avg_variance4x4_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c = - vp9_highbd_sub_pixel_avg_variance4x8_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c = - vp9_highbd_sub_pixel_avg_variance8x4_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c = - vp9_highbd_sub_pixel_avg_variance8x8_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c = - 
vp9_highbd_sub_pixel_avg_variance8x16_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c = - vp9_highbd_sub_pixel_avg_variance16x8_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c = - vp9_highbd_sub_pixel_avg_variance16x16_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c = - vp9_highbd_sub_pixel_avg_variance16x32_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c = - vp9_highbd_sub_pixel_avg_variance32x16_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c = - vp9_highbd_sub_pixel_avg_variance32x32_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c = - vp9_highbd_sub_pixel_avg_variance32x64_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c = - vp9_highbd_sub_pixel_avg_variance64x32_c; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c = - vp9_highbd_sub_pixel_avg_variance64x64_c; -INSTANTIATE_TEST_CASE_P( - C, VP9SubpelAvgVarianceHighTest, - ::testing::Values( - make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10), - make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10), - make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10), - make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10), - make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10), - make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10), - make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10), - make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10), - make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10), - make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10), - make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10), - make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10), - make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10), - make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12), - make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12), - make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12), - make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12), - make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12), - make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12), - make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12), - make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12), - make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12), - make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12), - make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12), - make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12), - make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12), - make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8), - make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8), - make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8), - make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8), - make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8), - make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8), - make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8), - make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8), - make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8), - make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8), - make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8), - make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8), - make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // CONFIG_VP9_ENCODER - -#if CONFIG_VP8_ENCODER -#if HAVE_MMX -const SubpixVarMxNFunc subpel_variance16x16_mmx = - 
vp8_sub_pixel_variance16x16_mmx; -const SubpixVarMxNFunc subpel_variance16x8_mmx = vp8_sub_pixel_variance16x8_mmx; -const SubpixVarMxNFunc subpel_variance8x16_mmx = vp8_sub_pixel_variance8x16_mmx; -const SubpixVarMxNFunc subpel_variance8x8_mmx = vp8_sub_pixel_variance8x8_mmx; -const SubpixVarMxNFunc subpel_variance4x4_mmx = vp8_sub_pixel_variance4x4_mmx; -INSTANTIATE_TEST_CASE_P( - MMX, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(4, 4, subpel_variance16x16_mmx, 0), - make_tuple(4, 3, subpel_variance16x8_mmx, 0), - make_tuple(3, 4, subpel_variance8x16_mmx, 0), - make_tuple(3, 3, subpel_variance8x8_mmx, 0), - make_tuple(2, 2, subpel_variance4x4_mmx, 0))); -#endif // HAVE_MMX -#endif // CONFIG_VP8_ENCODER -#if CONFIG_VP9_ENCODER -#if HAVE_SSE2 #if CONFIG_USE_X86INC -const SubpixVarMxNFunc subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse; -const SubpixVarMxNFunc subpel_variance4x8_sse = vp9_sub_pixel_variance4x8_sse; -const SubpixVarMxNFunc subpel_variance8x4_sse2 = vp9_sub_pixel_variance8x4_sse2; -const SubpixVarMxNFunc subpel_variance8x8_sse2 = vp9_sub_pixel_variance8x8_sse2; -const SubpixVarMxNFunc subpel_variance8x16_sse2 = - vp9_sub_pixel_variance8x16_sse2; -const SubpixVarMxNFunc subpel_variance16x8_sse2 = - vp9_sub_pixel_variance16x8_sse2; -const SubpixVarMxNFunc subpel_variance16x16_sse2 = - vp9_sub_pixel_variance16x16_sse2; -const SubpixVarMxNFunc subpel_variance16x32_sse2 = - vp9_sub_pixel_variance16x32_sse2; -const SubpixVarMxNFunc subpel_variance32x16_sse2 = - vp9_sub_pixel_variance32x16_sse2; -const SubpixVarMxNFunc subpel_variance32x32_sse2 = - vp9_sub_pixel_variance32x32_sse2; -const SubpixVarMxNFunc subpel_variance32x64_sse2 = - vp9_sub_pixel_variance32x64_sse2; -const SubpixVarMxNFunc subpel_variance64x32_sse2 = - vp9_sub_pixel_variance64x32_sse2; -const SubpixVarMxNFunc subpel_variance64x64_sse2 = - vp9_sub_pixel_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0), - make_tuple(2, 3, subpel_variance4x8_sse, 0), - make_tuple(3, 2, subpel_variance8x4_sse2, 0), - make_tuple(3, 3, subpel_variance8x8_sse2, 0), - make_tuple(3, 4, subpel_variance8x16_sse2, 0), - make_tuple(4, 3, subpel_variance16x8_sse2, 0), - make_tuple(4, 4, subpel_variance16x16_sse2, 0), - make_tuple(4, 5, subpel_variance16x32_sse2, 0), - make_tuple(5, 4, subpel_variance32x16_sse2, 0), - make_tuple(5, 5, subpel_variance32x32_sse2, 0), - make_tuple(5, 6, subpel_variance32x64_sse2, 0), - make_tuple(6, 5, subpel_variance64x32_sse2, 0), - make_tuple(6, 6, subpel_variance64x64_sse2, 0))); -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse = - vp9_sub_pixel_avg_variance4x4_sse; -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse = - vp9_sub_pixel_avg_variance4x8_sse; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 = - vp9_sub_pixel_avg_variance8x4_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 = - vp9_sub_pixel_avg_variance8x8_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 = - vp9_sub_pixel_avg_variance8x16_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 = - vp9_sub_pixel_avg_variance16x8_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 = - vp9_sub_pixel_avg_variance16x16_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 = - vp9_sub_pixel_avg_variance16x32_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 = - vp9_sub_pixel_avg_variance32x16_sse2; -const 
vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 = - vp9_sub_pixel_avg_variance32x32_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 = - vp9_sub_pixel_avg_variance32x64_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 = - vp9_sub_pixel_avg_variance64x32_sse2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 = - vp9_sub_pixel_avg_variance64x64_sse2; -INSTANTIATE_TEST_CASE_P( - SSE2, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0), - make_tuple(2, 3, subpel_avg_variance4x8_sse, 0), - make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0), - make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0), - make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0), - make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0), - make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0), - make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0), - make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0), - make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0), - make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0), - make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0), - make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0))); -#if CONFIG_VP9_HIGHBITDEPTH -const SubpixVarMxNFunc highbd_subpel_variance8x4_sse2 = - vp9_highbd_sub_pixel_variance8x4_sse2; -const SubpixVarMxNFunc highbd_subpel_variance8x8_sse2 = - vp9_highbd_sub_pixel_variance8x8_sse2; -const SubpixVarMxNFunc highbd_subpel_variance8x16_sse2 = - vp9_highbd_sub_pixel_variance8x16_sse2; -const SubpixVarMxNFunc highbd_subpel_variance16x8_sse2 = - vp9_highbd_sub_pixel_variance16x8_sse2; -const SubpixVarMxNFunc highbd_subpel_variance16x16_sse2 = - vp9_highbd_sub_pixel_variance16x16_sse2; -const SubpixVarMxNFunc highbd_subpel_variance16x32_sse2 = - vp9_highbd_sub_pixel_variance16x32_sse2; -const SubpixVarMxNFunc highbd_subpel_variance32x16_sse2 = - vp9_highbd_sub_pixel_variance32x16_sse2; -const SubpixVarMxNFunc highbd_subpel_variance32x32_sse2 = - vp9_highbd_sub_pixel_variance32x32_sse2; -const SubpixVarMxNFunc highbd_subpel_variance32x64_sse2 = - vp9_highbd_sub_pixel_variance32x64_sse2; -const SubpixVarMxNFunc highbd_subpel_variance64x32_sse2 = - vp9_highbd_sub_pixel_variance64x32_sse2; -const SubpixVarMxNFunc highbd_subpel_variance64x64_sse2 = - vp9_highbd_sub_pixel_variance64x64_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 = - vp9_highbd_10_sub_pixel_variance8x4_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 = - vp9_highbd_10_sub_pixel_variance8x8_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 = - vp9_highbd_10_sub_pixel_variance8x16_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 = - vp9_highbd_10_sub_pixel_variance16x8_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 = - vp9_highbd_10_sub_pixel_variance16x16_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 = - vp9_highbd_10_sub_pixel_variance16x32_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 = - vp9_highbd_10_sub_pixel_variance32x16_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 = - vp9_highbd_10_sub_pixel_variance32x32_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 = - vp9_highbd_10_sub_pixel_variance32x64_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 = - vp9_highbd_10_sub_pixel_variance64x32_sse2; -const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 = - vp9_highbd_10_sub_pixel_variance64x64_sse2; -const SubpixVarMxNFunc 
highbd_12_subpel_variance8x4_sse2 = - vp9_highbd_12_sub_pixel_variance8x4_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 = - vp9_highbd_12_sub_pixel_variance8x8_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 = - vp9_highbd_12_sub_pixel_variance8x16_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 = - vp9_highbd_12_sub_pixel_variance16x8_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 = - vp9_highbd_12_sub_pixel_variance16x16_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 = - vp9_highbd_12_sub_pixel_variance16x32_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 = - vp9_highbd_12_sub_pixel_variance32x16_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 = - vp9_highbd_12_sub_pixel_variance32x32_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 = - vp9_highbd_12_sub_pixel_variance32x64_sse2; -const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 = - vp9_highbd_12_sub_pixel_variance64x32_sse2; const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 = - vp9_highbd_12_sub_pixel_variance64x64_sse2; + vpx_highbd_12_sub_pixel_variance64x64_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 = + vpx_highbd_12_sub_pixel_variance64x32_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 = + vpx_highbd_12_sub_pixel_variance32x64_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 = + vpx_highbd_12_sub_pixel_variance32x32_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 = + vpx_highbd_12_sub_pixel_variance32x16_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 = + vpx_highbd_12_sub_pixel_variance16x32_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 = + vpx_highbd_12_sub_pixel_variance16x16_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 = + vpx_highbd_12_sub_pixel_variance16x8_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 = + vpx_highbd_12_sub_pixel_variance8x16_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 = + vpx_highbd_12_sub_pixel_variance8x8_sse2; +const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 = + vpx_highbd_12_sub_pixel_variance8x4_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 = + vpx_highbd_10_sub_pixel_variance64x64_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 = + vpx_highbd_10_sub_pixel_variance64x32_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 = + vpx_highbd_10_sub_pixel_variance32x64_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 = + vpx_highbd_10_sub_pixel_variance32x32_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 = + vpx_highbd_10_sub_pixel_variance32x16_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 = + vpx_highbd_10_sub_pixel_variance16x32_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 = + vpx_highbd_10_sub_pixel_variance16x16_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 = + vpx_highbd_10_sub_pixel_variance16x8_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 = + vpx_highbd_10_sub_pixel_variance8x16_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 = + vpx_highbd_10_sub_pixel_variance8x8_sse2; +const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 = + vpx_highbd_10_sub_pixel_variance8x4_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 = + vpx_highbd_8_sub_pixel_variance64x64_sse2; +const 
SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 = + vpx_highbd_8_sub_pixel_variance64x32_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 = + vpx_highbd_8_sub_pixel_variance32x64_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 = + vpx_highbd_8_sub_pixel_variance32x32_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 = + vpx_highbd_8_sub_pixel_variance32x16_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 = + vpx_highbd_8_sub_pixel_variance16x32_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 = + vpx_highbd_8_sub_pixel_variance16x16_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 = + vpx_highbd_8_sub_pixel_variance16x8_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 = + vpx_highbd_8_sub_pixel_variance8x16_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 = + vpx_highbd_8_sub_pixel_variance8x8_sse2; +const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 = + vpx_highbd_8_sub_pixel_variance8x4_sse2; INSTANTIATE_TEST_CASE_P( - SSE2, VP9SubpelVarianceHighTest, - ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10), - make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10), - make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10), - make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10), - make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10), - make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10), - make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10), - make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10), - make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10), - make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10), - make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10), - make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12), - make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12), - make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12), - make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12), - make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12), - make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12), - make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12), - make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12), - make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12), + SSE2, VpxHBDSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12), make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12), - make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12), - make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8), - make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8), - make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8), - make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8), - make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8), - make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8), - make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8), - make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8), - make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8), - make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8), - make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8))); -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 = - vp9_highbd_sub_pixel_avg_variance8x4_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 = - vp9_highbd_sub_pixel_avg_variance8x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 = - vp9_highbd_sub_pixel_avg_variance8x16_sse2; -const 
vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 = - vp9_highbd_sub_pixel_avg_variance16x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 = - vp9_highbd_sub_pixel_avg_variance16x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 = - vp9_highbd_sub_pixel_avg_variance16x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 = - vp9_highbd_sub_pixel_avg_variance32x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 = - vp9_highbd_sub_pixel_avg_variance32x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 = - vp9_highbd_sub_pixel_avg_variance32x64_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 = - vp9_highbd_sub_pixel_avg_variance64x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 = - vp9_highbd_sub_pixel_avg_variance64x64_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 = - vp9_highbd_10_sub_pixel_avg_variance8x4_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 = - vp9_highbd_10_sub_pixel_avg_variance8x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 = - vp9_highbd_10_sub_pixel_avg_variance8x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 = - vp9_highbd_10_sub_pixel_avg_variance16x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 = - vp9_highbd_10_sub_pixel_avg_variance16x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 = - vp9_highbd_10_sub_pixel_avg_variance16x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 = - vp9_highbd_10_sub_pixel_avg_variance32x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 = - vp9_highbd_10_sub_pixel_avg_variance32x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 = - vp9_highbd_10_sub_pixel_avg_variance32x64_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 = - vp9_highbd_10_sub_pixel_avg_variance64x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 = - vp9_highbd_10_sub_pixel_avg_variance64x64_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 = - vp9_highbd_12_sub_pixel_avg_variance8x4_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 = - vp9_highbd_12_sub_pixel_avg_variance8x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 = - vp9_highbd_12_sub_pixel_avg_variance8x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 = - vp9_highbd_12_sub_pixel_avg_variance16x8_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 = - vp9_highbd_12_sub_pixel_avg_variance16x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 = - vp9_highbd_12_sub_pixel_avg_variance16x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 = - vp9_highbd_12_sub_pixel_avg_variance32x16_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 = - vp9_highbd_12_sub_pixel_avg_variance32x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 = - vp9_highbd_12_sub_pixel_avg_variance32x64_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 = - 
vp9_highbd_12_sub_pixel_avg_variance64x32_sse2; -const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 = - vp9_highbd_12_sub_pixel_avg_variance64x64_sse2; + make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12), + make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12), + make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12), + make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12), + make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12), + make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12), + make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12), + make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12), + make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12), + make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10), + make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10), + make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10), + make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10), + make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10), + make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10), + make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10), + make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10), + make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10), + make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10), + make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10), + make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8), + make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8), + make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8), + make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8), + make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8), + make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8), + make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8), + make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8), + make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8), + make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8), + make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8))); + +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 = + vpx_highbd_12_sub_pixel_avg_variance64x64_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 = + vpx_highbd_12_sub_pixel_avg_variance64x32_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 = + vpx_highbd_12_sub_pixel_avg_variance32x64_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 = + vpx_highbd_12_sub_pixel_avg_variance32x32_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 = + vpx_highbd_12_sub_pixel_avg_variance32x16_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 = + vpx_highbd_12_sub_pixel_avg_variance16x32_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 = + vpx_highbd_12_sub_pixel_avg_variance16x16_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 = + vpx_highbd_12_sub_pixel_avg_variance16x8_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 = + vpx_highbd_12_sub_pixel_avg_variance8x16_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 = + vpx_highbd_12_sub_pixel_avg_variance8x8_sse2; +const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 = + vpx_highbd_12_sub_pixel_avg_variance8x4_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 = + vpx_highbd_10_sub_pixel_avg_variance64x64_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 = + 
vpx_highbd_10_sub_pixel_avg_variance64x32_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 = + vpx_highbd_10_sub_pixel_avg_variance32x64_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 = + vpx_highbd_10_sub_pixel_avg_variance32x32_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 = + vpx_highbd_10_sub_pixel_avg_variance32x16_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 = + vpx_highbd_10_sub_pixel_avg_variance16x32_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 = + vpx_highbd_10_sub_pixel_avg_variance16x16_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 = + vpx_highbd_10_sub_pixel_avg_variance16x8_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 = + vpx_highbd_10_sub_pixel_avg_variance8x16_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 = + vpx_highbd_10_sub_pixel_avg_variance8x8_sse2; +const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 = + vpx_highbd_10_sub_pixel_avg_variance8x4_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 = + vpx_highbd_8_sub_pixel_avg_variance64x64_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 = + vpx_highbd_8_sub_pixel_avg_variance64x32_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 = + vpx_highbd_8_sub_pixel_avg_variance32x64_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 = + vpx_highbd_8_sub_pixel_avg_variance32x32_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 = + vpx_highbd_8_sub_pixel_avg_variance32x16_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 = + vpx_highbd_8_sub_pixel_avg_variance16x32_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 = + vpx_highbd_8_sub_pixel_avg_variance16x16_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 = + vpx_highbd_8_sub_pixel_avg_variance16x8_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 = + vpx_highbd_8_sub_pixel_avg_variance8x16_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 = + vpx_highbd_8_sub_pixel_avg_variance8x8_sse2; +const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 = + vpx_highbd_8_sub_pixel_avg_variance8x4_sse2; INSTANTIATE_TEST_CASE_P( - SSE2, VP9SubpelAvgVarianceHighTest, + SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10), - make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10), - make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10), - make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10), - make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10), - make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10), - make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10), - make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10), - make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10), - make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10), - make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10), - make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12), - make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12), - make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12), - make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12), - make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12), - make_tuple(4, 5, 
highbd_12_subpel_avg_variance16x32_sse2, 12), - make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12), - make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12), - make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12), - make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12), - make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12), - make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8), - make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8), - make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8), - make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8), - make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8), - make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8), - make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8), - make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8), - make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8), - make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8), - make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12), + make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12), + make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12), + make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12), + make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12), + make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12), + make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12), + make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12), + make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12), + make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12), + make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12), + make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10), + make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10), + make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10), + make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10), + make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10), + make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10), + make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10), + make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10), + make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10), + make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10), + make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10), + make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8), + make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8), + make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8), + make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8), + make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8), + make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8), + make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8), + make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8), + make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8), + make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8), + make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8))); #endif // CONFIG_USE_X86INC +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 -#endif // CONFIG_VP9_ENCODER - -#if CONFIG_VP8_ENCODER -#if HAVE_SSE2 -const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 = - vp8_sub_pixel_variance16x16_wmt; -const SubpixVarMxNFunc vp8_subpel_variance16x8_sse2 = - vp8_sub_pixel_variance16x8_wmt; -const SubpixVarMxNFunc 
vp8_subpel_variance8x16_sse2 = - vp8_sub_pixel_variance8x16_wmt; -const SubpixVarMxNFunc vp8_subpel_variance8x8_sse2 = - vp8_sub_pixel_variance8x8_wmt; -const SubpixVarMxNFunc vp8_subpel_variance4x4_sse2 = - vp8_sub_pixel_variance4x4_wmt; -INSTANTIATE_TEST_CASE_P( - SSE2, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_sse2, 0), - make_tuple(3, 3, vp8_subpel_variance8x8_sse2, 0), - make_tuple(3, 4, vp8_subpel_variance8x16_sse2, 0), - make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0), - make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0))); -#endif // HAVE_SSE2 -#endif // CONFIG_VP8_ENCODER -#if CONFIG_VP9_ENCODER #if HAVE_SSSE3 #if CONFIG_USE_X86INC -const SubpixVarMxNFunc subpel_variance4x4_ssse3 = - vp9_sub_pixel_variance4x4_ssse3; -const SubpixVarMxNFunc subpel_variance4x8_ssse3 = - vp9_sub_pixel_variance4x8_ssse3; -const SubpixVarMxNFunc subpel_variance8x4_ssse3 = - vp9_sub_pixel_variance8x4_ssse3; -const SubpixVarMxNFunc subpel_variance8x8_ssse3 = - vp9_sub_pixel_variance8x8_ssse3; -const SubpixVarMxNFunc subpel_variance8x16_ssse3 = - vp9_sub_pixel_variance8x16_ssse3; -const SubpixVarMxNFunc subpel_variance16x8_ssse3 = - vp9_sub_pixel_variance16x8_ssse3; -const SubpixVarMxNFunc subpel_variance16x16_ssse3 = - vp9_sub_pixel_variance16x16_ssse3; -const SubpixVarMxNFunc subpel_variance16x32_ssse3 = - vp9_sub_pixel_variance16x32_ssse3; -const SubpixVarMxNFunc subpel_variance32x16_ssse3 = - vp9_sub_pixel_variance32x16_ssse3; -const SubpixVarMxNFunc subpel_variance32x32_ssse3 = - vp9_sub_pixel_variance32x32_ssse3; -const SubpixVarMxNFunc subpel_variance32x64_ssse3 = - vp9_sub_pixel_variance32x64_ssse3; -const SubpixVarMxNFunc subpel_variance64x32_ssse3 = - vp9_sub_pixel_variance64x32_ssse3; const SubpixVarMxNFunc subpel_variance64x64_ssse3 = - vp9_sub_pixel_variance64x64_ssse3; + vpx_sub_pixel_variance64x64_ssse3; +const SubpixVarMxNFunc subpel_variance64x32_ssse3 = + vpx_sub_pixel_variance64x32_ssse3; +const SubpixVarMxNFunc subpel_variance32x64_ssse3 = + vpx_sub_pixel_variance32x64_ssse3; +const SubpixVarMxNFunc subpel_variance32x32_ssse3 = + vpx_sub_pixel_variance32x32_ssse3; +const SubpixVarMxNFunc subpel_variance32x16_ssse3 = + vpx_sub_pixel_variance32x16_ssse3; +const SubpixVarMxNFunc subpel_variance16x32_ssse3 = + vpx_sub_pixel_variance16x32_ssse3; +const SubpixVarMxNFunc subpel_variance16x16_ssse3 = + vpx_sub_pixel_variance16x16_ssse3; +const SubpixVarMxNFunc subpel_variance16x8_ssse3 = + vpx_sub_pixel_variance16x8_ssse3; +const SubpixVarMxNFunc subpel_variance8x16_ssse3 = + vpx_sub_pixel_variance8x16_ssse3; +const SubpixVarMxNFunc subpel_variance8x8_ssse3 = + vpx_sub_pixel_variance8x8_ssse3; +const SubpixVarMxNFunc subpel_variance8x4_ssse3 = + vpx_sub_pixel_variance8x4_ssse3; +const SubpixVarMxNFunc subpel_variance4x8_ssse3 = + vpx_sub_pixel_variance4x8_ssse3; +const SubpixVarMxNFunc subpel_variance4x4_ssse3 = + vpx_sub_pixel_variance4x4_ssse3; INSTANTIATE_TEST_CASE_P( - SSSE3, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0), - make_tuple(2, 3, subpel_variance4x8_ssse3, 0), - make_tuple(3, 2, subpel_variance8x4_ssse3, 0), - make_tuple(3, 3, subpel_variance8x8_ssse3, 0), - make_tuple(3, 4, subpel_variance8x16_ssse3, 0), - make_tuple(4, 3, subpel_variance16x8_ssse3, 0), - make_tuple(4, 4, subpel_variance16x16_ssse3, 0), - make_tuple(4, 5, subpel_variance16x32_ssse3, 0), - make_tuple(5, 4, subpel_variance32x16_ssse3, 0), - make_tuple(5, 5, subpel_variance32x32_ssse3, 0), - make_tuple(5, 6, 
subpel_variance32x64_ssse3, 0), + SSSE3, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0), make_tuple(6, 5, subpel_variance64x32_ssse3, 0), - make_tuple(6, 6, subpel_variance64x64_ssse3, 0))); -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 = - vp9_sub_pixel_avg_variance4x4_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 = - vp9_sub_pixel_avg_variance4x8_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 = - vp9_sub_pixel_avg_variance8x4_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 = - vp9_sub_pixel_avg_variance8x8_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 = - vp9_sub_pixel_avg_variance8x16_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 = - vp9_sub_pixel_avg_variance16x8_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 = - vp9_sub_pixel_avg_variance16x16_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 = - vp9_sub_pixel_avg_variance16x32_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 = - vp9_sub_pixel_avg_variance32x16_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 = - vp9_sub_pixel_avg_variance32x32_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 = - vp9_sub_pixel_avg_variance32x64_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 = - vp9_sub_pixel_avg_variance64x32_ssse3; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 = - vp9_sub_pixel_avg_variance64x64_ssse3; + make_tuple(5, 6, subpel_variance32x64_ssse3, 0), + make_tuple(5, 5, subpel_variance32x32_ssse3, 0), + make_tuple(5, 4, subpel_variance32x16_ssse3, 0), + make_tuple(4, 5, subpel_variance16x32_ssse3, 0), + make_tuple(4, 4, subpel_variance16x16_ssse3, 0), + make_tuple(4, 3, subpel_variance16x8_ssse3, 0), + make_tuple(3, 4, subpel_variance8x16_ssse3, 0), + make_tuple(3, 3, subpel_variance8x8_ssse3, 0), + make_tuple(3, 2, subpel_variance8x4_ssse3, 0), + make_tuple(2, 3, subpel_variance4x8_ssse3, 0), + make_tuple(2, 2, subpel_variance4x4_ssse3, 0))); + +const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 = + vpx_sub_pixel_avg_variance64x64_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 = + vpx_sub_pixel_avg_variance64x32_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 = + vpx_sub_pixel_avg_variance32x64_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 = + vpx_sub_pixel_avg_variance32x32_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 = + vpx_sub_pixel_avg_variance32x16_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 = + vpx_sub_pixel_avg_variance16x32_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 = + vpx_sub_pixel_avg_variance16x16_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 = + vpx_sub_pixel_avg_variance16x8_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 = + vpx_sub_pixel_avg_variance8x16_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 = + vpx_sub_pixel_avg_variance8x8_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 = + vpx_sub_pixel_avg_variance8x4_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 = + vpx_sub_pixel_avg_variance4x8_ssse3; +const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 = + vpx_sub_pixel_avg_variance4x4_ssse3; INSTANTIATE_TEST_CASE_P( - SSSE3, VP9SubpelAvgVarianceTest, - 
::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0), - make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0), - make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0), - make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0), - make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0), - make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0), - make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0), - make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0), - make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0), - make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0), - make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0), + SSSE3, VpxSubpelAvgVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0), make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0), - make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0))); + make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0), + make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0), + make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0), + make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0), + make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0), + make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0), + make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0), + make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0), + make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0), + make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0), + make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0))); #endif // CONFIG_USE_X86INC #endif // HAVE_SSSE3 -#endif // CONFIG_VP9_ENCODER - -#if CONFIG_VP8_ENCODER -#if HAVE_SSSE3 -const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 = - vp8_sub_pixel_variance16x16_ssse3; -const SubpixVarMxNFunc vp8_subpel_variance16x8_ssse3 = - vp8_sub_pixel_variance16x8_ssse3; -INSTANTIATE_TEST_CASE_P( - SSSE3, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0), - make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0))); -#endif // HAVE_SSSE3 -#endif // CONFIG_VP8_ENCODER #if HAVE_AVX2 const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2; @@ -1910,39 +1828,46 @@ INSTANTIATE_TEST_CASE_P( make_tuple(5, 4, variance32x16_avx2, 0), make_tuple(4, 4, variance16x16_avx2, 0))); -#if CONFIG_VP9_ENCODER -const SubpixVarMxNFunc subpel_variance32x32_avx2 = - vp9_sub_pixel_variance32x32_avx2; const SubpixVarMxNFunc subpel_variance64x64_avx2 = - vp9_sub_pixel_variance64x64_avx2; + vpx_sub_pixel_variance64x64_avx2; +const SubpixVarMxNFunc subpel_variance32x32_avx2 = + vpx_sub_pixel_variance32x32_avx2; INSTANTIATE_TEST_CASE_P( - AVX2, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0), - make_tuple(6, 6, subpel_variance64x64_avx2, 0))); - -const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 = - vp9_sub_pixel_avg_variance32x32_avx2; -const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 = - vp9_sub_pixel_avg_variance64x64_avx2; + AVX2, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0), + make_tuple(5, 5, subpel_variance32x32_avx2, 0))); + +const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 = + vpx_sub_pixel_avg_variance64x64_avx2; +const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 = + vpx_sub_pixel_avg_variance32x32_avx2; INSTANTIATE_TEST_CASE_P( - AVX2, VP9SubpelAvgVarianceTest, - ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0), - make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0))); -#endif // CONFIG_VP9_ENCODER + AVX2, VpxSubpelAvgVarianceTest, + ::testing::Values(make_tuple(6, 6, 
subpel_avg_variance64x64_avx2, 0), + make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0))); #endif // HAVE_AVX2 -#if CONFIG_VP8_ENCODER #if HAVE_MEDIA +const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media; +INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, + ::testing::Values(make_tuple(4, 4, mse16x16_media))); + +const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media; +const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media; +INSTANTIATE_TEST_CASE_P( + MEDIA, VpxVarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_media, 0), + make_tuple(3, 3, variance8x8_media, 0))); + const SubpixVarMxNFunc subpel_variance16x16_media = - vp8_sub_pixel_variance16x16_armv6; + vpx_sub_pixel_variance16x16_media; const SubpixVarMxNFunc subpel_variance8x8_media = - vp8_sub_pixel_variance8x8_armv6; + vpx_sub_pixel_variance8x8_media; INSTANTIATE_TEST_CASE_P( - MEDIA, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0), - make_tuple(4, 4, subpel_variance16x16_media, 0))); + MEDIA, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0), + make_tuple(3, 3, subpel_variance8x8_media, 0))); #endif // HAVE_MEDIA -#endif // CONFIG_VP8_ENCODER #if HAVE_NEON const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon; @@ -1972,46 +1897,21 @@ INSTANTIATE_TEST_CASE_P( make_tuple(3, 4, variance8x16_neon, 0), make_tuple(3, 3, variance8x8_neon, 0))); -#if CONFIG_VP8_ENCODER -#if HAVE_NEON_ASM -const SubpixVarMxNFunc vp8_subpel_variance16x16_neon = - vp8_sub_pixel_variance16x16_neon; -INSTANTIATE_TEST_CASE_P( - NEON, VP8SubpelVarianceTest, - ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0))); -#endif // HAVE_NEON_ASM -#endif // CONFIG_VP8_ENCODER - -#if CONFIG_VP9_ENCODER -const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon; -const SubpixVarMxNFunc subpel_variance16x16_neon = - vp9_sub_pixel_variance16x16_neon; -const SubpixVarMxNFunc subpel_variance32x32_neon = - vp9_sub_pixel_variance32x32_neon; const SubpixVarMxNFunc subpel_variance64x64_neon = - vp9_sub_pixel_variance64x64_neon; + vpx_sub_pixel_variance64x64_neon; +const SubpixVarMxNFunc subpel_variance32x32_neon = + vpx_sub_pixel_variance32x32_neon; +const SubpixVarMxNFunc subpel_variance16x16_neon = + vpx_sub_pixel_variance16x16_neon; +const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon; INSTANTIATE_TEST_CASE_P( - NEON, VP9SubpelVarianceTest, - ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0), - make_tuple(4, 4, subpel_variance16x16_neon, 0), + NEON, VpxSubpelVarianceTest, + ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0), make_tuple(5, 5, subpel_variance32x32_neon, 0), - make_tuple(6, 6, subpel_variance64x64_neon, 0))); -#endif // CONFIG_VP9_ENCODER + make_tuple(4, 4, subpel_variance16x16_neon, 0), + make_tuple(3, 3, subpel_variance8x8_neon, 0))); #endif // HAVE_NEON -#if HAVE_MEDIA -const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media; -INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, - ::testing::Values(make_tuple(4, 4, mse16x16_media))); - -const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media; -const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media; -INSTANTIATE_TEST_CASE_P( - MEDIA, VpxVarianceTest, - ::testing::Values(make_tuple(4, 4, variance16x16_media, 0), - make_tuple(3, 3, variance8x8_media, 0))); -#endif // HAVE_MEDIA - #if HAVE_MSA INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest, ::testing::Values(vpx_get_mb_ss_msa)); @@ -2059,29 
+1959,28 @@ INSTANTIATE_TEST_CASE_P( make_tuple(2, 3, variance4x8_msa, 0), make_tuple(2, 2, variance4x4_msa, 0))); -#if CONFIG_VP9_ENCODER -const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa; -const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa; -const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa; -const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa; -const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa; -const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa; +const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa; +const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa; +const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa; +const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa; +const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa; +const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa; const SubpixVarMxNFunc subpel_variance16x16_msa = - vp9_sub_pixel_variance16x16_msa; + vpx_sub_pixel_variance16x16_msa; const SubpixVarMxNFunc subpel_variance16x32_msa = - vp9_sub_pixel_variance16x32_msa; + vpx_sub_pixel_variance16x32_msa; const SubpixVarMxNFunc subpel_variance32x16_msa = - vp9_sub_pixel_variance32x16_msa; + vpx_sub_pixel_variance32x16_msa; const SubpixVarMxNFunc subpel_variance32x32_msa = - vp9_sub_pixel_variance32x32_msa; + vpx_sub_pixel_variance32x32_msa; const SubpixVarMxNFunc subpel_variance32x64_msa = - vp9_sub_pixel_variance32x64_msa; + vpx_sub_pixel_variance32x64_msa; const SubpixVarMxNFunc subpel_variance64x32_msa = - vp9_sub_pixel_variance64x32_msa; + vpx_sub_pixel_variance64x32_msa; const SubpixVarMxNFunc subpel_variance64x64_msa = - vp9_sub_pixel_variance64x64_msa; + vpx_sub_pixel_variance64x64_msa; INSTANTIATE_TEST_CASE_P( - MSA, VP9SubpelVarianceTest, + MSA, VpxSubpelVarianceTest, ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0), make_tuple(2, 3, subpel_variance4x8_msa, 0), make_tuple(3, 2, subpel_variance8x4_msa, 0), @@ -2095,6 +1994,5 @@ INSTANTIATE_TEST_CASE_P( make_tuple(5, 6, subpel_variance32x64_msa, 0), make_tuple(6, 5, subpel_variance64x32_msa, 0), make_tuple(6, 6, subpel_variance64x64_msa, 0))); -#endif // CONFIG_VP9_ENCODER #endif // HAVE_MSA } // namespace diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c deleted file mode 100644 index 3c8ed11f0..000000000 --- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c +++ /dev/null @@ -1,1017 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include -#include "vpx_ports/mem.h" -#include "vpx/vpx_integer.h" - -static const uint8_t bilinear_taps_coeff[8][2] = { - {128, 0}, - {112, 16}, - { 96, 32}, - { 80, 48}, - { 64, 64}, - { 48, 80}, - { 32, 96}, - { 16, 112} -}; - -unsigned int vp8_sub_pixel_variance16x16_neon_func( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) { - int i; - DECLARE_ALIGNED(16, unsigned char, tmp[528]); - unsigned char *tmpp; - unsigned char *tmpp2; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8; - uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8; - uint8x8_t d19u8, d20u8, d21u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64, d2s64, d3s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; - uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16; - uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - tmpp2 = tmp + 272; - tmpp = tmp; - if (xoffset == 0) { // secondpass_bfilter16x16_only - d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); - d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); - - q11u8 = vld1q_u8(src_ptr); - src_ptr += src_pixels_per_line; - for (i = 4; i > 0; i--) { - q12u8 = vld1q_u8(src_ptr); - src_ptr += src_pixels_per_line; - q13u8 = vld1q_u8(src_ptr); - src_ptr += src_pixels_per_line; - q14u8 = vld1q_u8(src_ptr); - src_ptr += src_pixels_per_line; - q15u8 = vld1q_u8(src_ptr); - src_ptr += src_pixels_per_line; - - __builtin_prefetch(src_ptr); - __builtin_prefetch(src_ptr + src_pixels_per_line); - __builtin_prefetch(src_ptr + src_pixels_per_line * 2); - - q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); - q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); - q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); - q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); - q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); - q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); - q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); - q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); - - q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); - q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); - q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); - q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); - q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); - q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); - q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); - q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); - - d2u8 = vqrshrn_n_u16(q1u16, 7); - d3u8 = vqrshrn_n_u16(q2u16, 7); - d4u8 = vqrshrn_n_u16(q3u16, 7); - d5u8 = vqrshrn_n_u16(q4u16, 7); - d6u8 = vqrshrn_n_u16(q5u16, 7); - d7u8 = vqrshrn_n_u16(q6u16, 7); - d8u8 = vqrshrn_n_u16(q7u16, 7); - d9u8 = vqrshrn_n_u16(q8u16, 7); - - q1u8 = vcombine_u8(d2u8, d3u8); - q2u8 = vcombine_u8(d4u8, d5u8); - q3u8 = vcombine_u8(d6u8, d7u8); - q4u8 = vcombine_u8(d8u8, d9u8); - - q11u8 = q15u8; - - vst1q_u8((uint8_t *)tmpp2, q1u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q2u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q3u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q4u8); - tmpp2 += 16; - } - } else if (yoffset == 0) { // firstpass_bfilter16x16_only - d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); - d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); - - for (i = 4; i > 0 ; i--) { - d2u8 = 
vld1_u8(src_ptr); - d3u8 = vld1_u8(src_ptr + 8); - d4u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d5u8 = vld1_u8(src_ptr); - d6u8 = vld1_u8(src_ptr + 8); - d7u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d8u8 = vld1_u8(src_ptr); - d9u8 = vld1_u8(src_ptr + 8); - d10u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d11u8 = vld1_u8(src_ptr); - d12u8 = vld1_u8(src_ptr + 8); - d13u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - - __builtin_prefetch(src_ptr); - __builtin_prefetch(src_ptr + src_pixels_per_line); - __builtin_prefetch(src_ptr + src_pixels_per_line * 2); - - q7u16 = vmull_u8(d2u8, d0u8); - q8u16 = vmull_u8(d3u8, d0u8); - q9u16 = vmull_u8(d5u8, d0u8); - q10u16 = vmull_u8(d6u8, d0u8); - q11u16 = vmull_u8(d8u8, d0u8); - q12u16 = vmull_u8(d9u8, d0u8); - q13u16 = vmull_u8(d11u8, d0u8); - q14u16 = vmull_u8(d12u8, d0u8); - - d2u8 = vext_u8(d2u8, d3u8, 1); - d5u8 = vext_u8(d5u8, d6u8, 1); - d8u8 = vext_u8(d8u8, d9u8, 1); - d11u8 = vext_u8(d11u8, d12u8, 1); - - q7u16 = vmlal_u8(q7u16, d2u8, d1u8); - q9u16 = vmlal_u8(q9u16, d5u8, d1u8); - q11u16 = vmlal_u8(q11u16, d8u8, d1u8); - q13u16 = vmlal_u8(q13u16, d11u8, d1u8); - - d3u8 = vext_u8(d3u8, d4u8, 1); - d6u8 = vext_u8(d6u8, d7u8, 1); - d9u8 = vext_u8(d9u8, d10u8, 1); - d12u8 = vext_u8(d12u8, d13u8, 1); - - q8u16 = vmlal_u8(q8u16, d3u8, d1u8); - q10u16 = vmlal_u8(q10u16, d6u8, d1u8); - q12u16 = vmlal_u8(q12u16, d9u8, d1u8); - q14u16 = vmlal_u8(q14u16, d12u8, d1u8); - - d14u8 = vqrshrn_n_u16(q7u16, 7); - d15u8 = vqrshrn_n_u16(q8u16, 7); - d16u8 = vqrshrn_n_u16(q9u16, 7); - d17u8 = vqrshrn_n_u16(q10u16, 7); - d18u8 = vqrshrn_n_u16(q11u16, 7); - d19u8 = vqrshrn_n_u16(q12u16, 7); - d20u8 = vqrshrn_n_u16(q13u16, 7); - d21u8 = vqrshrn_n_u16(q14u16, 7); - - q7u8 = vcombine_u8(d14u8, d15u8); - q8u8 = vcombine_u8(d16u8, d17u8); - q9u8 = vcombine_u8(d18u8, d19u8); - q10u8 = vcombine_u8(d20u8, d21u8); - - vst1q_u8((uint8_t *)tmpp2, q7u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q8u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q9u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q10u8); - tmpp2 += 16; - } - } else { - d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]); - d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]); - - d2u8 = vld1_u8(src_ptr); - d3u8 = vld1_u8(src_ptr + 8); - d4u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d5u8 = vld1_u8(src_ptr); - d6u8 = vld1_u8(src_ptr + 8); - d7u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d8u8 = vld1_u8(src_ptr); - d9u8 = vld1_u8(src_ptr + 8); - d10u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d11u8 = vld1_u8(src_ptr); - d12u8 = vld1_u8(src_ptr + 8); - d13u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - - // First Pass: output_height lines x output_width columns (17x16) - for (i = 3; i > 0; i--) { - q7u16 = vmull_u8(d2u8, d0u8); - q8u16 = vmull_u8(d3u8, d0u8); - q9u16 = vmull_u8(d5u8, d0u8); - q10u16 = vmull_u8(d6u8, d0u8); - q11u16 = vmull_u8(d8u8, d0u8); - q12u16 = vmull_u8(d9u8, d0u8); - q13u16 = vmull_u8(d11u8, d0u8); - q14u16 = vmull_u8(d12u8, d0u8); - - d2u8 = vext_u8(d2u8, d3u8, 1); - d5u8 = vext_u8(d5u8, d6u8, 1); - d8u8 = vext_u8(d8u8, d9u8, 1); - d11u8 = vext_u8(d11u8, d12u8, 1); - - q7u16 = vmlal_u8(q7u16, d2u8, d1u8); - q9u16 = vmlal_u8(q9u16, d5u8, d1u8); - q11u16 = vmlal_u8(q11u16, d8u8, d1u8); - q13u16 = vmlal_u8(q13u16, d11u8, d1u8); - - d3u8 = vext_u8(d3u8, d4u8, 1); - d6u8 = vext_u8(d6u8, d7u8, 1); - d9u8 = vext_u8(d9u8, d10u8, 1); - d12u8 = vext_u8(d12u8, 
d13u8, 1); - - q8u16 = vmlal_u8(q8u16, d3u8, d1u8); - q10u16 = vmlal_u8(q10u16, d6u8, d1u8); - q12u16 = vmlal_u8(q12u16, d9u8, d1u8); - q14u16 = vmlal_u8(q14u16, d12u8, d1u8); - - d14u8 = vqrshrn_n_u16(q7u16, 7); - d15u8 = vqrshrn_n_u16(q8u16, 7); - d16u8 = vqrshrn_n_u16(q9u16, 7); - d17u8 = vqrshrn_n_u16(q10u16, 7); - d18u8 = vqrshrn_n_u16(q11u16, 7); - d19u8 = vqrshrn_n_u16(q12u16, 7); - d20u8 = vqrshrn_n_u16(q13u16, 7); - d21u8 = vqrshrn_n_u16(q14u16, 7); - - d2u8 = vld1_u8(src_ptr); - d3u8 = vld1_u8(src_ptr + 8); - d4u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d5u8 = vld1_u8(src_ptr); - d6u8 = vld1_u8(src_ptr + 8); - d7u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d8u8 = vld1_u8(src_ptr); - d9u8 = vld1_u8(src_ptr + 8); - d10u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - d11u8 = vld1_u8(src_ptr); - d12u8 = vld1_u8(src_ptr + 8); - d13u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - - q7u8 = vcombine_u8(d14u8, d15u8); - q8u8 = vcombine_u8(d16u8, d17u8); - q9u8 = vcombine_u8(d18u8, d19u8); - q10u8 = vcombine_u8(d20u8, d21u8); - - vst1q_u8((uint8_t *)tmpp, q7u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q8u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q9u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q10u8); - tmpp += 16; - } - - // First-pass filtering for rest 5 lines - d14u8 = vld1_u8(src_ptr); - d15u8 = vld1_u8(src_ptr + 8); - d16u8 = vld1_u8(src_ptr + 16); - src_ptr += src_pixels_per_line; - - q9u16 = vmull_u8(d2u8, d0u8); - q10u16 = vmull_u8(d3u8, d0u8); - q11u16 = vmull_u8(d5u8, d0u8); - q12u16 = vmull_u8(d6u8, d0u8); - q13u16 = vmull_u8(d8u8, d0u8); - q14u16 = vmull_u8(d9u8, d0u8); - - d2u8 = vext_u8(d2u8, d3u8, 1); - d5u8 = vext_u8(d5u8, d6u8, 1); - d8u8 = vext_u8(d8u8, d9u8, 1); - - q9u16 = vmlal_u8(q9u16, d2u8, d1u8); - q11u16 = vmlal_u8(q11u16, d5u8, d1u8); - q13u16 = vmlal_u8(q13u16, d8u8, d1u8); - - d3u8 = vext_u8(d3u8, d4u8, 1); - d6u8 = vext_u8(d6u8, d7u8, 1); - d9u8 = vext_u8(d9u8, d10u8, 1); - - q10u16 = vmlal_u8(q10u16, d3u8, d1u8); - q12u16 = vmlal_u8(q12u16, d6u8, d1u8); - q14u16 = vmlal_u8(q14u16, d9u8, d1u8); - - q1u16 = vmull_u8(d11u8, d0u8); - q2u16 = vmull_u8(d12u8, d0u8); - q3u16 = vmull_u8(d14u8, d0u8); - q4u16 = vmull_u8(d15u8, d0u8); - - d11u8 = vext_u8(d11u8, d12u8, 1); - d14u8 = vext_u8(d14u8, d15u8, 1); - - q1u16 = vmlal_u8(q1u16, d11u8, d1u8); - q3u16 = vmlal_u8(q3u16, d14u8, d1u8); - - d12u8 = vext_u8(d12u8, d13u8, 1); - d15u8 = vext_u8(d15u8, d16u8, 1); - - q2u16 = vmlal_u8(q2u16, d12u8, d1u8); - q4u16 = vmlal_u8(q4u16, d15u8, d1u8); - - d10u8 = vqrshrn_n_u16(q9u16, 7); - d11u8 = vqrshrn_n_u16(q10u16, 7); - d12u8 = vqrshrn_n_u16(q11u16, 7); - d13u8 = vqrshrn_n_u16(q12u16, 7); - d14u8 = vqrshrn_n_u16(q13u16, 7); - d15u8 = vqrshrn_n_u16(q14u16, 7); - d16u8 = vqrshrn_n_u16(q1u16, 7); - d17u8 = vqrshrn_n_u16(q2u16, 7); - d18u8 = vqrshrn_n_u16(q3u16, 7); - d19u8 = vqrshrn_n_u16(q4u16, 7); - - q5u8 = vcombine_u8(d10u8, d11u8); - q6u8 = vcombine_u8(d12u8, d13u8); - q7u8 = vcombine_u8(d14u8, d15u8); - q8u8 = vcombine_u8(d16u8, d17u8); - q9u8 = vcombine_u8(d18u8, d19u8); - - vst1q_u8((uint8_t *)tmpp, q5u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q6u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q7u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q8u8); - tmpp += 16; - vst1q_u8((uint8_t *)tmpp, q9u8); - - // secondpass_filter - d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]); - d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]); - - tmpp = tmp; - tmpp2 = tmpp + 272; - q11u8 = vld1q_u8(tmpp); - 
tmpp += 16; - for (i = 4; i > 0; i--) { - q12u8 = vld1q_u8(tmpp); - tmpp += 16; - q13u8 = vld1q_u8(tmpp); - tmpp += 16; - q14u8 = vld1q_u8(tmpp); - tmpp += 16; - q15u8 = vld1q_u8(tmpp); - tmpp += 16; - - q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8); - q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8); - q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8); - q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8); - q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8); - q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8); - q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8); - q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8); - - q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8); - q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8); - q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8); - q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8); - q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8); - q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8); - q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8); - q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8); - - d2u8 = vqrshrn_n_u16(q1u16, 7); - d3u8 = vqrshrn_n_u16(q2u16, 7); - d4u8 = vqrshrn_n_u16(q3u16, 7); - d5u8 = vqrshrn_n_u16(q4u16, 7); - d6u8 = vqrshrn_n_u16(q5u16, 7); - d7u8 = vqrshrn_n_u16(q6u16, 7); - d8u8 = vqrshrn_n_u16(q7u16, 7); - d9u8 = vqrshrn_n_u16(q8u16, 7); - - q1u8 = vcombine_u8(d2u8, d3u8); - q2u8 = vcombine_u8(d4u8, d5u8); - q3u8 = vcombine_u8(d6u8, d7u8); - q4u8 = vcombine_u8(d8u8, d9u8); - - q11u8 = q15u8; - - vst1q_u8((uint8_t *)tmpp2, q1u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q2u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q3u8); - tmpp2 += 16; - vst1q_u8((uint8_t *)tmpp2, q4u8); - tmpp2 += 16; - } - } - - // sub_pixel_variance16x16_neon - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - tmpp = tmp + 272; - for (i = 0; i < 8; i++) { // sub_pixel_variance16x16_neon_loop - q0u8 = vld1q_u8(tmpp); - tmpp += 16; - q1u8 = vld1q_u8(tmpp); - tmpp += 16; - q2u8 = vld1q_u8(dst_ptr); - dst_ptr += dst_pixels_per_line; - q3u8 = vld1q_u8(dst_ptr); - dst_ptr += dst_pixels_per_line; - - d0u8 = vget_low_u8(q0u8); - d1u8 = vget_high_u8(q0u8); - d2u8 = vget_low_u8(q1u8); - d3u8 = vget_high_u8(q1u8); - - q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8)); - q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8)); - q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8)); - q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vget_low_s64(q0s64); - 
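/*
 * An illustrative scalar sketch of what this 16x16 sub-pixel variance kernel
 * computes (and, for 8-bit input, what the two-tap path of the vpx_dsp
 * sub_pixel_variance replacements computes): a horizontal then a vertical
 * bilinear pass with taps from bilinear_taps_coeff (f0 + f1 == 128, rounded
 * shift by 7 after each pass), followed by variance = sse - sum^2 / (16 * 16)
 * against the reference. The helper name and buffer layout are assumptions
 * for illustration only; int64_t comes from vpx/vpx_integer.h above.
 */
static unsigned int subpel_variance16x16_sketch(const unsigned char *src,
                                                int src_stride,
                                                int xoffset, int yoffset,
                                                const unsigned char *ref,
                                                int ref_stride,
                                                unsigned int *sse) {
  unsigned char hbuf[17 * 16];  /* horizontal pass keeps one extra row */
  unsigned char pred[16 * 16];
  int sum = 0;
  unsigned int sq = 0;
  int r, c;
  for (r = 0; r < 17; ++r) {  /* first pass: horizontal bilinear filter */
    for (c = 0; c < 16; ++c) {
      hbuf[r * 16 + c] = (unsigned char)
          ((bilinear_taps_coeff[xoffset][0] * src[r * src_stride + c] +
            bilinear_taps_coeff[xoffset][1] * src[r * src_stride + c + 1] +
            64) >> 7);
    }
  }
  for (r = 0; r < 16; ++r) {  /* second pass: vertical bilinear filter */
    for (c = 0; c < 16; ++c) {
      pred[r * 16 + c] = (unsigned char)
          ((bilinear_taps_coeff[yoffset][0] * hbuf[r * 16 + c] +
            bilinear_taps_coeff[yoffset][1] * hbuf[(r + 1) * 16 + c] +
            64) >> 7);
    }
  }
  for (r = 0; r < 16; ++r) {  /* variance of the prediction vs. reference */
    for (c = 0; c < 16; ++c) {
      const int diff = pred[r * 16 + c] - ref[r * ref_stride + c];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
  }
  *sse = sq;
  return sq - (unsigned int)(((int64_t)sum * sum) >> 8);  /* 16 * 16 = 256 */
}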
d1s64 = vget_high_s64(q0s64); - d2s64 = vget_low_s64(q1s64); - d3s64 = vget_high_s64(q1s64); - d0s64 = vadd_s64(d0s64, d1s64); - d1s64 = vadd_s64(d2s64, d3s64); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance_halfpixvar16x16_h_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64, d2s64, d3s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8; - uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8; - uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon - q0u8 = vld1q_u8(src_ptr); - q1u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q2u8 = vld1q_u8(src_ptr); - q3u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q4u8 = vld1q_u8(src_ptr); - q5u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q6u8 = vld1q_u8(src_ptr); - q7u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - - q11u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q12u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q13u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q14u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q1u8 = vextq_u8(q0u8, q1u8, 1); - q3u8 = vextq_u8(q2u8, q3u8, 1); - q5u8 = vextq_u8(q4u8, q5u8, 1); - q7u8 = vextq_u8(q6u8, q7u8, 1); - - q0u8 = vrhaddq_u8(q0u8, q1u8); - q1u8 = vrhaddq_u8(q2u8, q3u8); - q2u8 = vrhaddq_u8(q4u8, q5u8); - q3u8 = vrhaddq_u8(q6u8, q7u8); - - d0u8 = vget_low_u8(q0u8); - d1u8 = vget_high_u8(q0u8); - d2u8 = vget_low_u8(q1u8); - d3u8 = vget_high_u8(q1u8); - d4u8 = vget_low_u8(q2u8); - d5u8 = vget_high_u8(q2u8); - d6u8 = vget_low_u8(q3u8); - d7u8 = vget_high_u8(q3u8); - - q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8)); - q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8)); - q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8)); - q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8)); - q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8)); - q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8)); - q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8)); - q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8)); - - d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16)); - d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16)); - q9s32 = vmlal_s16(q9s32, d8s16, d8s16); - q10s32 = vmlal_s16(q10s32, d9s16, d9s16); - d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); - d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16)); - q9s32 = vmlal_s16(q9s32, d10s16, d10s16); - q10s32 = vmlal_s16(q10s32, d11s16, d11s16); - d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); - d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16)); - q9s32 = vmlal_s16(q9s32, d12s16, d12s16); - q10s32 = vmlal_s16(q10s32, d13s16, d13s16); - 
d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16)); - d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16)); - q9s32 = vmlal_s16(q9s32, d14s16, d14s16); - q10s32 = vmlal_s16(q10s32, d15s16, d15s16); - d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); - d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); - q9s32 = vmlal_s16(q9s32, d0s16, d0s16); - q10s32 = vmlal_s16(q10s32, d1s16, d1s16); - d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); - d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); - q9s32 = vmlal_s16(q9s32, d2s16, d2s16); - q10s32 = vmlal_s16(q10s32, d3s16, d3s16); - d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); - d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); - q9s32 = vmlal_s16(q9s32, d4s16, d4s16); - q10s32 = vmlal_s16(q10s32, d5s16, d5s16); - d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); - d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); - q9s32 = vmlal_s16(q9s32, d6s16, d6s16); - q10s32 = vmlal_s16(q10s32, d7s16, d7s16); - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vget_low_s64(q0s64); - d1s64 = vget_high_s64(q0s64); - d2s64 = vget_low_s64(q1s64); - d3s64 = vget_high_s64(q1s64); - d0s64 = vadd_s64(d0s64, d1s64); - d1s64 = vadd_s64(d2s64, d3s64); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance_halfpixvar16x16_v_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8; - int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64, d2s64, d3s64; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8; - uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16; - int32x4_t q8s32, q9s32, q10s32; - int64x2_t q0s64, q1s64, q5s64; - - q8s32 = vdupq_n_s32(0); - q9s32 = vdupq_n_s32(0); - q10s32 = vdupq_n_s32(0); - - q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon - q2u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q4u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q6u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - q15u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; - - q1u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q5u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q7u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q2u8 = vrhaddq_u8(q2u8, q4u8); - q4u8 = vrhaddq_u8(q4u8, q6u8); - q6u8 = vrhaddq_u8(q6u8, q15u8); - - d0u8 = vget_low_u8(q0u8); - d1u8 = vget_high_u8(q0u8); - d4u8 = vget_low_u8(q2u8); - d5u8 = vget_high_u8(q2u8); - d8u8 = vget_low_u8(q4u8); - d9u8 = vget_high_u8(q4u8); - d12u8 = vget_low_u8(q6u8); - d13u8 = 
vget_high_u8(q6u8); - - q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8)); - q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8)); - q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8)); - q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8)); - q0u16 = vsubl_u8(d8u8, vget_low_u8(q5u8)); - q1u16 = vsubl_u8(d9u8, vget_high_u8(q5u8)); - q2u16 = vsubl_u8(d12u8, vget_low_u8(q7u8)); - q3u16 = vsubl_u8(d13u8, vget_high_u8(q7u8)); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); - q9s32 = vmlal_s16(q9s32, d22s16, d22s16); - q10s32 = vmlal_s16(q10s32, d23s16, d23s16); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); - q9s32 = vmlal_s16(q9s32, d24s16, d24s16); - q10s32 = vmlal_s16(q10s32, d25s16, d25s16); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); - q9s32 = vmlal_s16(q9s32, d26s16, d26s16); - q10s32 = vmlal_s16(q10s32, d27s16, d27s16); - d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); - d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); - q9s32 = vmlal_s16(q9s32, d28s16, d28s16); - q10s32 = vmlal_s16(q10s32, d29s16, d29s16); - d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); - d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16)); - q9s32 = vmlal_s16(q9s32, d0s16, d0s16); - q10s32 = vmlal_s16(q10s32, d1s16, d1s16); - d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); - d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16)); - q9s32 = vmlal_s16(q9s32, d2s16, d2s16); - q10s32 = vmlal_s16(q10s32, d3s16, d3s16); - d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16)); - d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16)); - q9s32 = vmlal_s16(q9s32, d4s16, d4s16); - q10s32 = vmlal_s16(q10s32, d5s16, d5s16); - d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16)); - d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16)); - q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16)); - q9s32 = vmlal_s16(q9s32, d6s16, d6s16); - q10s32 = vmlal_s16(q10s32, d7s16, d7s16); - - q0u8 = q15u8; - } - - q10s32 = vaddq_s32(q10s32, q9s32); - q0s64 = vpaddlq_s32(q8s32); - q1s64 = vpaddlq_s32(q10s32); - - d0s64 = vget_low_s64(q0s64); - d1s64 = vget_high_s64(q0s64); - d2s64 = vget_low_s64(q1s64); - d3s64 = vget_high_s64(q1s64); - d0s64 = vadd_s64(d0s64, d1s64); - d1s64 = vadd_s64(d2s64, d3s64); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -unsigned int vp8_variance_halfpixvar16x16_hv_neon( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - int i; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16; - int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16; - uint32x2_t d0u32, d10u32; - int64x1_t d0s64, d1s64, d2s64, d3s64; 
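/*
 * An illustrative scalar sketch of the half-pixel horizontal+vertical variant
 * below: instead of the general bilinear taps it uses the cheaper rounding
 * average (a + b + 1) >> 1, first between horizontal neighbours and then
 * between consecutive filtered rows (the vrhaddq_u8 pairs in the NEON code),
 * before the same sse - sum^2 / 256 reduction. The helper name is an
 * assumption for illustration only.
 */
static unsigned int halfpix_hv_variance16x16_sketch(const unsigned char *src,
                                                    int src_stride,
                                                    const unsigned char *ref,
                                                    int ref_stride,
                                                    unsigned int *sse) {
  int sum = 0;
  unsigned int sq = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      /* horizontal rounding average of row r and row r + 1 ... */
      const int cur = (src[r * src_stride + c] +
                       src[r * src_stride + c + 1] + 1) >> 1;
      const int nxt = (src[(r + 1) * src_stride + c] +
                       src[(r + 1) * src_stride + c + 1] + 1) >> 1;
      /* ... then a vertical rounding average of the two filtered rows */
      const int diff = ((cur + nxt + 1) >> 1) - ref[r * ref_stride + c];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
  }
  *sse = sq;
  return sq - (unsigned int)(((int64_t)sum * sum) >> 8);
}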
- uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8; - uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16; - int32x4_t q13s32, q14s32, q15s32; - int64x2_t q0s64, q1s64, q5s64; - - q13s32 = vdupq_n_s32(0); - q14s32 = vdupq_n_s32(0); - q15s32 = vdupq_n_s32(0); - - q0u8 = vld1q_u8(src_ptr); - q1u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q1u8 = vextq_u8(q0u8, q1u8, 1); - q0u8 = vrhaddq_u8(q0u8, q1u8); - for (i = 0; i < 4; i++) { // vp8_filt_fpo16x16s_4_0_loop_neon - q2u8 = vld1q_u8(src_ptr); - q3u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q4u8 = vld1q_u8(src_ptr); - q5u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q6u8 = vld1q_u8(src_ptr); - q7u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - q8u8 = vld1q_u8(src_ptr); - q9u8 = vld1q_u8(src_ptr + 16); - src_ptr += source_stride; - - q3u8 = vextq_u8(q2u8, q3u8, 1); - q5u8 = vextq_u8(q4u8, q5u8, 1); - q7u8 = vextq_u8(q6u8, q7u8, 1); - q9u8 = vextq_u8(q8u8, q9u8, 1); - - q1u8 = vrhaddq_u8(q2u8, q3u8); - q2u8 = vrhaddq_u8(q4u8, q5u8); - q3u8 = vrhaddq_u8(q6u8, q7u8); - q4u8 = vrhaddq_u8(q8u8, q9u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); - q1u8 = vrhaddq_u8(q1u8, q2u8); - q2u8 = vrhaddq_u8(q2u8, q3u8); - q3u8 = vrhaddq_u8(q3u8, q4u8); - - q5u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q6u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q7u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - q8u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; - - d0u8 = vget_low_u8(q0u8); - d1u8 = vget_high_u8(q0u8); - d2u8 = vget_low_u8(q1u8); - d3u8 = vget_high_u8(q1u8); - d4u8 = vget_low_u8(q2u8); - d5u8 = vget_high_u8(q2u8); - d6u8 = vget_low_u8(q3u8); - d7u8 = vget_high_u8(q3u8); - - q9u16 = vsubl_u8(d0u8, vget_low_u8(q5u8)); - q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8)); - q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8)); - q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8)); - q0u16 = vsubl_u8(d4u8, vget_low_u8(q7u8)); - q1u16 = vsubl_u8(d5u8, vget_high_u8(q7u8)); - q5u16 = vsubl_u8(d6u8, vget_low_u8(q8u8)); - q6u16 = vsubl_u8(d7u8, vget_high_u8(q8u8)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16)); - q14s32 = vmlal_s16(q14s32, d18s16, d18s16); - q15s32 = vmlal_s16(q15s32, d19s16, d19s16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16)); - q14s32 = vmlal_s16(q14s32, d20s16, d20s16); - q15s32 = vmlal_s16(q15s32, d21s16, d21s16); - - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16)); - q14s32 = vmlal_s16(q14s32, d22s16, d22s16); - q15s32 = vmlal_s16(q15s32, d23s16, d23s16); - - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16)); - q14s32 = vmlal_s16(q14s32, d24s16, d24s16); - q15s32 = vmlal_s16(q15s32, d25s16, d25s16); - - d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16)); - d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16)); - q14s32 = vmlal_s16(q14s32, d0s16, d0s16); - q15s32 = vmlal_s16(q15s32, d1s16, d1s16); - - d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16)); - d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16)); - q13s32 = 
vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16)); - q14s32 = vmlal_s16(q14s32, d2s16, d2s16); - q15s32 = vmlal_s16(q15s32, d3s16, d3s16); - - d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16)); - d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16)); - q14s32 = vmlal_s16(q14s32, d10s16, d10s16); - q15s32 = vmlal_s16(q15s32, d11s16, d11s16); - - d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16)); - d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16)); - q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16)); - q14s32 = vmlal_s16(q14s32, d12s16, d12s16); - q15s32 = vmlal_s16(q15s32, d13s16, d13s16); - - q0u8 = q4u8; - } - - q15s32 = vaddq_s32(q14s32, q15s32); - q0s64 = vpaddlq_s32(q13s32); - q1s64 = vpaddlq_s32(q15s32); - - d0s64 = vget_low_s64(q0s64); - d1s64 = vget_high_s64(q0s64); - d2s64 = vget_low_s64(q1s64); - d3s64 = vget_high_s64(q1s64); - d0s64 = vadd_s64(d0s64, d1s64); - d1s64 = vadd_s64(d2s64, d3s64); - - q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), - vreinterpret_s32_s64(d0s64)); - vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); - - d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8); - d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); - - return vget_lane_u32(d0u32, 0); -} - -#define FILTER_BITS 7 - -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static void variance_neon_w8(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { - int i, j; - int16x8_t v_sum = vdupq_n_s16(0); - int32x4_t v_sse_lo = vdupq_n_s32(0); - int32x4_t v_sse_hi = vdupq_n_s32(0); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const uint8x8_t v_a = vld1_u8(&a[j]); - const uint8x8_t v_b = vld1_u8(&b[j]); - const uint16x8_t v_diff = vsubl_u8(v_a, v_b); - const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); - v_sum = vaddq_s16(v_sum, sv_diff); - v_sse_lo = vmlal_s16(v_sse_lo, - vget_low_s16(sv_diff), - vget_low_s16(sv_diff)); - v_sse_hi = vmlal_s16(v_sse_hi, - vget_high_s16(sv_diff), - vget_high_s16(sv_diff)); - } - a += a_stride; - b += b_stride; - } - - *sum = horizontal_add_s16x8(v_sum); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); -} - -static unsigned int variance8x8_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse) { - int sum; - variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); - return *sse - (((int64_t)sum * sum) / (8 * 8)); -} - -static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *vpx_filter) { - const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]); - const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]); - unsigned int i; - for (i = 0; i < output_height; ++i) { - const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); - const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); - const uint16x8_t a = 
vmull_u8(src_0, f0); - const uint16x8_t b = vmlal_u8(a, src_1, f1); - const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } -} - -unsigned int vp8_sub_pixel_variance8x8_neon( - const unsigned char *src, - int src_stride, - int xoffset, - int yoffset, - const unsigned char *dst, - int dst_stride, - unsigned int *sse) { - DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]); - DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); - if (xoffset == 0) { - var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8, - 8, bilinear_taps_coeff[yoffset]); - } else if (yoffset == 0) { - var_filter_block2d_bil_w8(src, temp2, src_stride, 1, - 9, 8, - bilinear_taps_coeff[xoffset]); - } else { - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, - 9, 8, - bilinear_taps_coeff[xoffset]); - var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, - 8, bilinear_taps_coeff[yoffset]); - } - return variance8x8_neon(temp2, 8, dst, dst_stride, sse); -} diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c deleted file mode 100644 index 0f293f03d..000000000 --- a/vp8/common/arm/variance_arm.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "./vp8_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "vp8/common/variance.h" -#include "vp8/common/filter.h" - -// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder -#if CONFIG_VP8_ENCODER - -#if HAVE_MEDIA -#include "vp8/common/arm/bilinearfilter_arm.h" - -unsigned int vp8_sub_pixel_variance8x8_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short first_pass[10*8]; - unsigned char second_pass[8*8]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 9, 8, HFilter); - vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 8, 8, 8, VFilter); - - return vpx_variance8x8_media(second_pass, 8, dst_ptr, - dst_pixels_per_line, sse); -} - -unsigned int vp8_sub_pixel_variance16x16_armv6 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short first_pass[36*16]; - unsigned char second_pass[20*16]; - const short *HFilter, *VFilter; - unsigned int var; - - if (xoffset == 4 && yoffset == 0) - { - var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } - else if (xoffset == 0 && yoffset == 4) - { - var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } - else if (xoffset == 4 && yoffset == 4) - { - var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, sse); - } - else - { - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = 
vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass, - src_pixels_per_line, - 17, 16, HFilter); - vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass, - 16, 16, 16, VFilter); - - var = vpx_variance16x16_media(second_pass, 16, dst_ptr, - dst_pixels_per_line, sse); - } - return var; -} - -#endif // HAVE_MEDIA - - -#if HAVE_NEON - -extern unsigned int vp8_sub_pixel_variance16x16_neon_func -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -); - -unsigned int vp8_sub_pixel_variance16x16_neon -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - if (xoffset == 4 && yoffset == 0) - return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 0 && yoffset == 4) - return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else if (xoffset == 4 && yoffset == 4) - return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - else - return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); -} - -#endif // HAVE_NEON -#endif // CONFIG_VP8_ENCODER diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index 5c0680f42..2bfefb126 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -20,7 +20,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vp8/common/postproc.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/yv12config.h" diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index ea2457725..db3822278 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -237,47 +237,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, specialize qw/vp8_bilinear_predict4x4 mmx media/; $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; -# -# Sub-pixel Variance -# -add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/; -$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt; - -add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/; -$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt; -$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6; - -add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/; -$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt; - -add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/; 
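As an illustration (not part of the patch itself), here is a rough C sketch of what each vp8_sub_pixel_variance{WxH} prototype above computes, and what the consolidated vpx_dsp versions continue to compute. It mirrors the reference code removed from vp8/common/variance_c.c further down in this patch; the helper and table names are illustrative, not the project's real symbols.

#include <stdint.h>

#define BIL_SHIFT 7  /* the two taps always sum to 128 = 1 << 7 */

static const int16_t kBilinearTaps[8][2] = {  /* same values as vp8_bilinear_filters */
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }
};

static uint32_t subpel_variance_wxh(const uint8_t *src, int src_stride,
                                    int xoffset, int yoffset,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse) {
  uint16_t fdata[17 * 16];  /* first-pass output: (h + 1) rows of w, up to 16x16 */
  uint8_t temp[16 * 16];    /* second-pass output: h rows of w */
  const int16_t *hf = kBilinearTaps[xoffset];
  const int16_t *vf = kBilinearTaps[yoffset];
  int64_t sum = 0;
  uint32_t sum_sq = 0;
  int i, j;

  /* First pass: horizontal 2-tap bilinear filter, one extra row for pass 2. */
  for (i = 0; i < h + 1; ++i)
    for (j = 0; j < w; ++j)
      fdata[i * w + j] = (src[i * src_stride + j] * hf[0] +
                          src[i * src_stride + j + 1] * hf[1] + 64) >> BIL_SHIFT;

  /* Second pass: vertical 2-tap bilinear filter over the intermediate rows. */
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j)
      temp[i * w + j] = (uint8_t)((fdata[i * w + j] * vf[0] +
                                   fdata[(i + 1) * w + j] * vf[1] + 64) >> BIL_SHIFT);

  /* Plain variance of the filtered block against the reference block. */
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j) {
      const int diff = temp[i * w + j] - ref[i * ref_stride + j];
      sum += diff;
      sum_sq += (uint32_t)(diff * diff);
    }
  *sse = sum_sq;
  /* The removed C code divides by shifting right by ctz(w) + ctz(h); for
   * power-of-two block sizes that is the same as dividing by w * h. */
  return sum_sq - (uint32_t)((sum * sum) / (w * h));
}

For 16x16 blocks, xoffset == 4 and/or yoffset == 4 select the {64, 64} taps, i.e. a plain half-pixel average, which is why the vp8_variance_halfpixvar16x16_{h,v,hv} entries below exist as dedicated fast paths.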
-$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt; - -add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char *src_ptr, int source_stride, int xoffset, int yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"; -specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/; -$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt; -$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6; -$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon; - -add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/; -$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt; -$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6; - -add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/; -$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt; -$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6; - -add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/; -$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt; -$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6; - # # Encoder functions below this point. # diff --git a/vp8/common/variance.h b/vp8/common/variance.h deleted file mode 100644 index c6c9f41bf..000000000 --- a/vp8/common/variance.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef VP8_COMMON_VARIANCE_H_ -#define VP8_COMMON_VARIANCE_H_ - -#include "vpx_config.h" - -#include "vpx/vpx_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef unsigned int(*vpx_sad_fn_t)( - const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride); - -typedef void (*vp8_copy32xn_fn_t)( - const unsigned char *src_ptr, - int source_stride, - unsigned char *ref_ptr, - int ref_stride, - int n); - -typedef void (*vpx_sad_multi_fn_t)( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_array, - int ref_stride, - unsigned int *sad_array); - -typedef void (*vpx_sad_multi_d_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char * const ref_array[], - int ref_stride, - unsigned int *sad_array - ); - -typedef unsigned int (*vpx_variance_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int ref_stride, - unsigned int *sse - ); - -typedef unsigned int (*vp8_subpixvariance_fn_t) - ( - const unsigned char *src_ptr, - int source_stride, - int xoffset, - int yoffset, - const unsigned char *ref_ptr, - int Refstride, - unsigned int *sse - ); - -typedef struct variance_vtable -{ - vpx_sad_fn_t sdf; - vpx_variance_fn_t vf; - vp8_subpixvariance_fn_t svf; - vpx_variance_fn_t svf_halfpix_h; - vpx_variance_fn_t svf_halfpix_v; - vpx_variance_fn_t svf_halfpix_hv; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; - vpx_sad_multi_d_fn_t sdx4df; -#if ARCH_X86 || ARCH_X86_64 - vp8_copy32xn_fn_t copymem; -#endif -} vp8_variance_fn_ptr_t; - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_VARIANCE_H_ diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c deleted file mode 100644 index 02915a4de..000000000 --- a/vp8/common/variance_c.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp8_rtcd.h" -#include "filter.h" -#include "variance.h" - -/* This is a bad idea. - * ctz = count trailing zeros */ -static int ctz(int a) { - int b = 0; - while (a != 1) { - a >>= 1; - b++; - } - return b; -} - -static unsigned int variance( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse) -{ - int i, j; - int diff, sum; - - sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) - { - for (j = 0; j < w; j++) - { - diff = src_ptr[j] - ref_ptr[j]; - sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h))))); -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : UINT8 *src_ptr : Pointer to source block. - * UINT32 src_pixels_per_line : Stride of input block. - * UINT32 pixel_step : Offset between filter input samples (see notes). - * UINT32 output_height : Input block height. - * UINT32 output_width : Input block width. - * INT32 *vp8_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : INT32 *output_ptr : Pointer to filtered block. 
- * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. - * Two filter taps should sum to VP8_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -) -{ - unsigned int i, j; - - for (i = 0; i < output_height; i++) - { - for (j = 0; j < output_width; j++) - { - /* Apply bilinear filter */ - output_ptr[j] = (((int)src_ptr[0] * vp8_filter[0]) + - ((int)src_ptr[pixel_step] * vp8_filter[1]) + - (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; - src_ptr++; - } - - /* Next row... */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : INT32 *src_ptr : Pointer to source block. - * UINT32 src_pixels_per_line : Stride of input block. - * UINT32 pixel_step : Offset between filter input samples (see notes). - * UINT32 output_height : Input block height. - * UINT32 output_width : Input block width. - * INT32 *vp8_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP8_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass -( - const unsigned short *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp8_filter -) -{ - unsigned int i, j; - int Temp; - - for (i = 0; i < output_height; i++) - { - for (j = 0; j < output_width; j++) - { - /* Apply filter */ - Temp = ((int)src_ptr[0] * vp8_filter[0]) + - ((int)src_ptr[pixel_step] * vp8_filter[1]) + - (VP8_FILTER_WEIGHT / 2); - output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... 
*/ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - - -unsigned int vp8_sub_pixel_variance4x4_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned char temp2[20*16]; - const short *HFilter, *VFilter; - unsigned short FData3[5*4]; /* Temp data bufffer used in filtering */ - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - /* First filter 1d Horizontal */ - var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); - - /* Now filter Verticaly */ - var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter); - - return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse); -} - - -unsigned int vp8_sub_pixel_variance8x8_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short FData3[9*8]; /* Temp data bufffer used in filtering */ - unsigned char temp2[20*16]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); - var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); - - return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse); -} - -unsigned int vp8_sub_pixel_variance16x16_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short FData3[17*16]; /* Temp data bufffer used in filtering */ - unsigned char temp2[20*16]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); - var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); - - return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse); -} - - -unsigned int vp8_variance_halfpixvar16x16_h_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp8_variance_halfpixvar16x16_v_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp8_variance_halfpixvar16x16_hv_c( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp8_sub_pixel_variance16x8_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short FData3[16*9]; /* Temp data bufffer used in filtering */ - unsigned char temp2[20*16]; - const short *HFilter, *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - 
var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); - var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); - - return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse); -} - -unsigned int vp8_sub_pixel_variance8x16_c -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - unsigned short FData3[9*16]; /* Temp data bufffer used in filtering */ - unsigned char temp2[20*16]; - const short *HFilter, *VFilter; - - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - - var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); - var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter); - - return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse); -} diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm deleted file mode 100644 index 26de5e860..000000000 --- a/vp8/common/x86/variance_impl_sse2.asm +++ /dev/null @@ -1,972 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - -;void vp8_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE -sym(vp8_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - 
- pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq 
mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_half_horiz_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE -sym(vp8_half_horiz_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source -%else - add rsi, r8 -%endif - -vp8_half_horiz_vert_variance8x_h_1: - - movq xmm1, QWORD PTR [rsi] ; - movq xmm2, QWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - - sub rcx, 1 ; - jnz vp8_half_horiz_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_half_horiz_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(vp8_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - 
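For reference (a sketch only, with illustrative naming, not part of the patch): the vp8_half_horiz_vert_variance*_h_sse2 kernels smooth the block at ref_ptr with rounded 2x2 averaging -- pavgb across adjacent columns, then pavgb between consecutive rows -- and accumulate the signed sum and the sum of squared differences against src_ptr. In scalar C:

static void half_horiz_vert_variance_c(const unsigned char *ref_ptr,
                                       int ref_stride,
                                       const unsigned char *src_ptr,
                                       int src_stride, int w, int h,
                                       int *sum, unsigned int *sumsquared) {
  int i, j, s = 0;
  unsigned int ss = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      /* pavgb rounds up: (a + b + 1) >> 1, applied horizontally then vertically. */
      const int top = (ref_ptr[j] + ref_ptr[j + 1] + 1) >> 1;
      const int bot =
          (ref_ptr[ref_stride + j] + ref_ptr[ref_stride + j + 1] + 1) >> 1;
      const int diff = ((top + bot + 1) >> 1) - src_ptr[j];
      s += diff;
      ss += (unsigned int)(diff * diff);
    }
    ref_ptr += ref_stride;
    src_ptr += src_stride;
  }
  *sum = s;
  *sumsquared = ss;
}

Half-pixel offsets (xoffset or yoffset equal to 4) select the {64, 64} filter taps, so the callers special-case them onto these pavgb-based kernels instead of the generic bilinear filter path.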
movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -vp8_half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz vp8_half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_half_vert_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE -sym(vp8_half_vert_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; -vp8_half_vert_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - - sub rcx, 1 ; - jnz vp8_half_vert_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - 
pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_half_vert_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE -sym(vp8_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr - - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -vp8_half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz vp8_half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_half_horiz_variance8x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE -sym(vp8_half_horiz_variance8x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor xmm0, xmm0 ; -vp8_half_horiz_variance8x_h_1: - movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s8 - movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s9 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - punpcklbw xmm5, xmm0 ; xmm5 = words of above - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d8 - 
punpcklbw xmm3, xmm0 ; xmm3 = words of above - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - -%if ABI_IS_32BIT - add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source - add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination -%else - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz vp8_half_horiz_variance8x_h_1 ; - - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(5) ; sum - mov rdi, arg(6) ; sumsquared - - movd [rsi], mm2 ; - movd [rdi], mm4 ; - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp8_half_horiz_variance16x_h_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE -sym(vp8_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - - pxor xmm0, xmm0 ; - -vp8_half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz vp8_half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -vp8_bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 
16, 16, 16, 16 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/vp8/common/x86/variance_impl_ssse3.asm b/vp8/common/x86/variance_impl_ssse3.asm deleted file mode 100644 index 686b4a902..000000000 --- a/vp8/common/x86/variance_impl_ssse3.asm +++ /dev/null @@ -1,364 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - - -;void vp8_filter_block2d_bil_var_ssse3 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -;Note: The filter coefficient at offset=0 is 128. Since the second register -;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. -global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE -sym(vp8_filter_block2d_bil_var_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .filter_block2d_bil_var_ssse3_sp_only - - shl rax, 4 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je .filter_block2d_bil_var_ssse3_fp_only - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi+1] - movdqa xmm2, xmm0 - - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, [rax] - pmaddubsw xmm2, [rax] - - paddw xmm0, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm0, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - packuswb xmm0, xmm2 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + r8] -%endif - -.filter_block2d_bil_var_ssse3_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - packuswb xmm1, xmm3 - - movdqa xmm2, xmm0 - movdqa xmm0, xmm1 - movdqa xmm3, xmm2 - - punpcklbw xmm2, xmm1 - punpckhbw xmm3, xmm1 - pmaddubsw xmm2, [rdx] - pmaddubsw xmm3, [rdx] - - paddw xmm2, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm2, xmm_filter_shift - psraw xmm3, 
xmm_filter_shift - - movq xmm1, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm1, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm2, xmm1 - psubw xmm3, xmm5 - paddw xmm6, xmm2 - paddw xmm6, xmm3 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm2 - paddd xmm7, xmm3 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rsi, [rsi + r8] - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_var_ssse3_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je .filter_block2d_bil_var_ssse3_full_pixel - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - movdqu xmm1, XMMWORD PTR [rsi] - movdqa xmm0, xmm1 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - lea rsi, [rsi + rax] - -.filter_block2d_bil_sp_only_loop: - movdqu xmm3, XMMWORD PTR [rsi] - movdqa xmm2, xmm1 - movdqa xmm0, xmm3 - - punpcklbw xmm1, xmm3 - punpckhbw xmm2, xmm3 - pmaddubsw xmm1, [rdx] - pmaddubsw xmm2, [rdx] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - movq xmm3, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm3, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm3 - psubw xmm2, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - movdqa xmm1, xmm0 - lea rsi, [rsi + rax] ;ref_pixels_per_line - -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_sp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 - -.filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] - punpcklbw xmm1, xmm0 - movq xmm2, QWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - - movq xmm3, QWORD PTR [rdi] - punpcklbw xmm3, xmm0 - movq xmm4, QWORD PTR [rdi+8] - punpcklbw xmm4, xmm0 - - psubw xmm1, xmm3 - psubw xmm2, xmm4 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rdx] ;src_pixels_per_line - sub rcx, 1 - jnz .filter_block2d_bil_full_pixel_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -.filter_block2d_bil_fp_only_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm2, XMMWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm2, 
xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm2 - psubw xmm3, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm3 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm1 - paddd xmm7, xmm3 - - lea rsi, [rsi + rdx] -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_fp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_variance: - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(7) ;[Sum] - mov rdi, arg(8) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -vp8_bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 112, 16 - times 8 db 96, 32 - times 8 db 80, 48 - times 8 db 64, 64 - times 8 db 48, 80 - times 8 db 32, 96 - times 8 db 16, 112 diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c deleted file mode 100644 index 2a0df640a..000000000 --- a/vp8/common/x86/variance_ssse3.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp8_rtcd.h" -#include "vpx_config.h" -#include "vp8/common/variance.h" -#include "vpx_ports/mem.h" - -extern void vp8_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp8_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp8_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp8_filter_block2d_bil_var_ssse3 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp8_sub_pixel_variance16x16_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0; - unsigned int xxsum0; - - /* note we could avoid these if statements if the calling function - * just called the appropriate functions inside. 
- */ - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else - { - vp8_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp8_sub_pixel_variance16x8_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) -{ - int xsum0; - unsigned int xxsum0; - - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else - { - vp8_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} diff --git a/vp8/common/x86/vp8_variance_impl_mmx.asm b/vp8/common/x86/vp8_variance_impl_mmx.asm deleted file mode 100644 index 97f25275d..000000000 --- a/vp8/common/x86/vp8_variance_impl_mmx.asm +++ /dev/null @@ -1,353 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
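These wrappers, here and in the MMX/SSE2 files below, all finish the same way: the filtering kernel hands back a running sum of differences and a running sum of squared differences, *sse receives the raw SSE, and the return value is SSE - Sum^2/N with N = width * height, written as a shift because N is a power of two (>>8 for 16x16, >>7 for 16x8 and 8x16, >>6 for 8x8, >>4 for 4x4). A minimal sketch of that common tail; variance_from_sums() is an illustrative name, not part of the patch:

static unsigned int variance_from_sums(int sum, unsigned int sum_sq,
                                       int log2_pixels, unsigned int *sse) {
  *sse = sum_sq;  /* raw sum of squared differences */
  /* variance = SSE - Sum^2 / N, with N = 1 << log2_pixels pixels */
  return sum_sq - (((unsigned int)sum * sum) >> log2_pixels);
}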
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%define mmx_filter_shift 7 - -;void vp8_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp8_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp8_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE -sym(vp8_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, 
[GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/vp8/common/x86/vp8_variance_mmx.c b/vp8/common/x86/vp8_variance_mmx.c deleted file mode 100644 index e594b1e65..000000000 --- a/vp8/common/x86/vp8_variance_mmx.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
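The MMX kernel above is easier to follow as plain C: one horizontal two-tap pass per row, the previously filtered row kept around for the vertical pass, and the difference against the source block accumulated as a sum and a sum of squares. A rough reference under the same (128 - 16*k, 16*k) filters and +64/>>7 rounding; the 16-bit lane-width details of the assembly are glossed over, and the names here are illustrative rather than taken from the patch:

static void bil_var_ref(const unsigned char *ref, int ref_stride,
                        const unsigned char *src, int src_stride,
                        int w, int h, int xoffset, int yoffset,
                        int *sum, unsigned int *sse) {
  unsigned short row[2][16];  /* two filtered rows; w is at most 16 here */
  const int h0 = 128 - (xoffset << 4), h1 = xoffset << 4;
  const int v0 = 128 - (yoffset << 4), v1 = yoffset << 4;
  int r, c, cur = 0;
  *sum = 0;
  *sse = 0;
  for (c = 0; c < w; ++c)  /* first pass: horizontal filter of row 0 */
    row[cur][c] = (ref[c] * h0 + ref[c + 1] * h1 + 64) >> 7;
  for (r = 0; r < h; ++r) {
    ref += ref_stride;
    for (c = 0; c < w; ++c)  /* first pass on the next row */
      row[cur ^ 1][c] = (ref[c] * h0 + ref[c + 1] * h1 + 64) >> 7;
    for (c = 0; c < w; ++c) {  /* second pass, then accumulate vs. src */
      const int p =
          (row[cur][c] * v0 + row[cur ^ 1][c] * v1 + 64) >> 7;
      const int d = p - src[c];
      *sum += d;
      *sse += d * d;
    }
    src += src_stride;
    cur ^= 1;
  }
}

vp8_sub_pixel_variance8x8_mmx below then roughly corresponds to this loop at w = h = 8 followed by the SSE - Sum^2/N reduction.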
- */ - -#include "./vp8_rtcd.h" -#include "vpx_config.h" -#include "vp8/common/variance.h" -#include "vpx_ports/mem.h" -#include "vp8/common/x86/filter_x86.h" - -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *filter -); - -extern void vp8_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp8_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp8_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; - vp8_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp8_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - - int xsum; - unsigned int xxsum; - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp8_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum0, &xxsum0 - ); - - - vp8_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp8_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum0, &xxsum0 - ); - - - 
vp8_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp8_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum; - unsigned int xxsum; - vp8_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp8_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp8_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp8_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) -{ - return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, - ref_ptr, recon_stride, sse); -} diff --git a/vp8/common/x86/vp8_variance_sse2.c b/vp8/common/x86/vp8_variance_sse2.c deleted file mode 100644 index 1c15ed880..000000000 --- a/vp8/common/x86/vp8_variance_sse2.c +++ /dev/null @@ -1,403 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
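The offsets are in eighth-pel units, so a value of 4 means exactly half a pixel: both bilinear taps become 64 and the filter collapses to a rounded two-pixel average. That is why the _mmx halfpix wrappers just above simply forward (4, 0), (0, 4) and (4, 4) to the general sub-pixel routine, and why the SSE2 code below dispatches those same offsets to dedicated vp8_half_horiz/vert/horiz_vert kernels (declared here, implemented elsewhere) for which a rounded average is sufficient. A one-line sketch of the degenerate case; halfpel_avg() is an illustrative name, not part of the patch:

static unsigned char halfpel_avg(unsigned char a, unsigned char b) {
  /* (a*64 + b*64 + 64) >> 7  ==  (a + b + 1) >> 1 */
  return (unsigned char)((a + b + 1) >> 1);
}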
- */ - -#include "./vp8_rtcd.h" -#include "vpx_config.h" -#include "vp8/common/variance.h" -#include "vpx_ports/mem.h" -#include "vp8/common/x86/filter_x86.h" - -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); - -extern void vp8_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - -void vp8_filter_block2d_bil_var_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); -void vp8_half_horiz_vert_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp8_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp8_half_horiz_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp8_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp8_half_vert_variance8x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -void vp8_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp8_sub_pixel_variance4x4_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum; - unsigned int xxsum; - vp8_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp8_sub_pixel_variance8x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, 
- int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum; - unsigned int xxsum; - - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } - else - { - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp8_sub_pixel_variance16x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - /* note we could avoid these if statements if the calling function - * just called the appropriate functions inside. - */ - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } - else - { - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0 - ); - - vp8_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum1, &xxsum1 - ); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp8_sub_pixel_variance16x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) -{ - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } - else - { - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - - vp8_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum1, &xxsum1); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp8_sub_pixel_variance8x16_wmt -( - const unsigned char 
*src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) -{ - int xsum; - unsigned int xxsum; - - if (xoffset == 4 && yoffset == 0) - { - vp8_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } - else if (xoffset == 0 && yoffset == 4) - { - vp8_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } - else if (xoffset == 4 && yoffset == 4) - { - vp8_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } - else - { - vp8_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp8_variance_halfpixvar16x16_h_wmt( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) -{ - int xsum0; - unsigned int xxsum0; - - vp8_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - - -unsigned int vp8_variance_halfpixvar16x16_v_wmt( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) -{ - int xsum0; - unsigned int xxsum0; - vp8_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - - -unsigned int vp8_variance_halfpixvar16x16_hv_wmt( - const unsigned char *src_ptr, - int src_pixels_per_line, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) -{ - int xsum0; - unsigned int xxsum0; - - vp8_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 3deb4abb3..4c2acc774 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -16,7 +16,7 @@ #include "./vpx_scale_rtcd.h" #include "block.h" #include "onyx_int.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #include "encodeintra.h" #include "vp8/common/setupintrarecon.h" #include "vp8/common/systemdependent.h" diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index f284f7c38..1694af819 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -13,7 +13,7 @@ #define VP8_ENCODER_MCOMP_H_ #include "block.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #ifdef __cplusplus extern "C" { diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 40e29e191..c2a7ac4ce 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2132,17 +2132,17 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16; cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16; - cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16; - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h; - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v; - cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = 
vp8_variance_halfpixvar16x16_hv; + cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16; + cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vpx_variance_halfpixvar16x16_h; + cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vpx_variance_halfpixvar16x16_v; + cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv; cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3; cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8; cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d; cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8; cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8; - cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8; + cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL; @@ -2152,7 +2152,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16; cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16; - cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16; + cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL; @@ -2162,7 +2162,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8; cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8; - cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8; + cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL; @@ -2172,7 +2172,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4; cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4; - cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4; + cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index c48e2f447..6fe8f235b 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -18,7 +18,7 @@ #include "treewriter.h" #include "tokenize.h" #include "vp8/common/onyxc_int.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #include "encodemb.h" #include "quantize.h" #include "vp8/common/entropy.h" diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 053bf119a..f3443dbcc 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -22,7 +22,7 @@ #include "encodemb.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra4x4.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #include "mcomp.h" #include "rdopt.h" #include "vpx_mem/vpx_mem.h" diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 17194f0d4..edd6c58ec 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -29,7 +29,7 @@ #include "vp8/common/quant_common.h" #include "encodemb.h" #include "quantize.h" -#include "vp8/common/variance.h" +#include "vpx_dsp/variance.h" #include "mcomp.h" #include "rdopt.h" #include "vpx_mem/vpx_mem.h" @@ -500,9 +500,9 @@ int VP8_UVSSE(MACROBLOCK *x) if ((mv_row | mv_col) & 7) { - vp8_sub_pixel_variance8x8(uptr, pre_stride, + vpx_sub_pixel_variance8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2); - vp8_sub_pixel_variance8x8(vptr, pre_stride, + vpx_sub_pixel_variance8x8(vptr, 
pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1); sse2 += sse1; } diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 16833fda6..47ad97cb2 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -63,8 +63,6 @@ VP8_COMMON_SRCS-yes += common/reconintra.c VP8_COMMON_SRCS-yes += common/reconintra4x4.c VP8_COMMON_SRCS-yes += common/setupintrarecon.c VP8_COMMON_SRCS-yes += common/swapyv12buffer.c -VP8_COMMON_SRCS-yes += common/variance_c.c -VP8_COMMON_SRCS-yes += common/variance.h VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h @@ -86,8 +84,6 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c -VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm @@ -96,12 +92,8 @@ VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm -VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c -VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm @@ -129,7 +121,6 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/variance_arm.c # common (media) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c @@ -149,9 +140,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c @@ -170,6 +158,5 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/vp9/common/mips/msa/vp9_convolve_avg_msa.c index eb8776078..7c11e4065 100644 --- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c +++ 
b/vp9/common/mips/msa/vp9_convolve_avg_msa.c @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { diff --git a/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/vp9/common/mips/msa/vp9_convolve_copy_msa.c index 7a292c5ce..39a0b24d5 100644 --- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c +++ b/vp9/common/mips/msa/vp9_convolve_copy_msa.c @@ -9,7 +9,7 @@ */ #include -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height) { diff --git a/vp9/common/mips/msa/vp9_convolve_msa.h b/vp9/common/mips/msa/vp9_convolve_msa.h index 40fe94d3b..71c616b67 100644 --- a/vp9/common/mips/msa/vp9_convolve_msa.h +++ b/vp9/common/mips/msa/vp9_convolve_msa.h @@ -12,7 +12,7 @@ #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ #include "vp9/common/vp9_filter.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" extern const uint8_t mc_filt_mask_arr[16 * 3]; diff --git a/vp9/common/mips/msa/vp9_idct_msa.h b/vp9/common/mips/msa/vp9_idct_msa.h index 60e27fc11..c86e65ae9 100644 --- a/vp9/common/mips/msa/vp9_idct_msa.h +++ b/vp9/common/mips/msa/vp9_idct_msa.h @@ -13,7 +13,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ v8i16 k0_m = __msa_fill_h(cnst0); \ diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c index 2fc610505..abf2704ca 100644 --- a/vp9/common/mips/msa/vp9_intra_predict_msa.c +++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c @@ -9,7 +9,7 @@ */ #include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ out0 = __msa_subs_u_h(out0, in0); \ diff --git a/vp9/common/mips/msa/vp9_loopfilter_msa.h b/vp9/common/mips/msa/vp9_loopfilter_msa.h index 0643e41a5..bfbe8708f 100644 --- a/vp9/common/mips/msa/vp9_loopfilter_msa.h +++ b/vp9/common/mips/msa/vp9_loopfilter_msa.h @@ -11,7 +11,7 @@ #ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ #define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_ -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" #define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ p1_out, p0_out, q0_out, q1_out) { \ diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h deleted file mode 100644 index e008eafe9..000000000 --- a/vp9/common/mips/msa/vp9_macros_msa.h +++ /dev/null @@ -1,1885 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ -#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ - -#include - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) - -#if (__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) -#else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ -}) -#endif // (__mips == 64) - -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} -#else // !(__mips_isa_rev >= 6) -#define LH(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#define LW(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__ ( \ - "ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) - -#if (__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__ ( \ - "uld %[val_m], %[psrc_m] \n\t" \ 
- \ - : [val_m] "=r" (val_m) \ - : [psrc_m] "m" (*psrc_m) \ - ); \ - \ - val_m; \ -}) -#else // !(__mips == 64) -#define LD(psrc) ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ -}) -#endif // (__mips == 64) - -#define SH(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - -#define SD(val, pdst) { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ -} -#endif // (__mips_isa_rev >= 6) - -/* Description : Load 4 words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1, out2, out3 - Details : Load word in 'out0' from (psrc) - Load word in 'out1' from (psrc + stride) - Load word in 'out2' from (psrc + 2 * stride) - Load word in 'out3' from (psrc + 3 * stride) -*/ -#define LW4(psrc, stride, out0, out1, out2, out3) { \ - out0 = LW((psrc)); \ - out1 = LW((psrc) + stride); \ - out2 = LW((psrc) + 2 * stride); \ - out3 = LW((psrc) + 3 * stride); \ -} - -/* Description : Load double words with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load double word in 'out0' from (psrc) - Load double word in 'out1' from (psrc + stride) -*/ -#define LD2(psrc, stride, out0, out1) { \ - out0 = LD((psrc)); \ - out1 = LD((psrc) + stride); \ -} -#define LD4(psrc, stride, out0, out1, out2, out3) { \ - LD2((psrc), stride, out0, out1); \ - LD2((psrc) + 2 * stride, stride, out2, out3); \ -} - -/* Description : Store 4 words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store word from 'in0' to (pdst) - Store word from 'in1' to (pdst + stride) - Store word from 'in2' to (pdst + 2 * stride) - Store word from 'in3' to (pdst + 3 * stride) -*/ -#define SW4(in0, in1, in2, in3, pdst, stride) { \ - SW(in0, (pdst)) \ - SW(in1, (pdst) + stride); \ - SW(in2, (pdst) + 2 * stride); \ - SW(in3, (pdst) + 3 * stride); \ -} - -/* Description : Store 4 double words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store double word from 'in0' to (pdst) - Store double word from 'in1' to (pdst + stride) - Store double word from 'in2' to (pdst + 2 * stride) - Store double word from 'in3' to (pdst + 3 * stride) -*/ -#define SD4(in0, in1, in2, in3, pdst, stride) { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ -} - -/* Description : Load vectors with 16 byte elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Load 16 byte elements in 'out0' from (psrc) - Load 16 byte elements in 'out1' from (psrc + stride) -*/ -#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = 
LD_B(RTYPE, (psrc) + stride); \ -} -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) - -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ -} -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) - -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ -} -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) - -#define LD_B7(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6) { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ -} -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) - -#define LD_B8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) - -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ -} -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ -} -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) - -#define LD_H8(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ -} -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - LD_H8(RTYPE, (psrc), stride, \ - out0, out1, out2, out3, out4, out5, out6, out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ - out8, out9, out10, out11, out12, out13, out14, out15); \ -} -#define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) - -/* Description : Load 4x4 block of signed halfword elements from 1D source - data into 4 vectors (Each vector with 4 signed halfwords) - Arguments : Input - psrc - Outputs - out0, out1, out2, out3 -*/ -#define LD4x4_SH(psrc, out0, out1, out2, out3) { \ - out0 = LD_SH(psrc); \ - out2 = LD_SH(psrc + 8); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ -} - -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ -} - -/* Description : Store vectors of 16 byte elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 16 byte elements from 'in0' to (pdst) - Store 16 byte elements from 'in1' to (pdst + stride) -*/ -#define ST_B2(RTYPE, in0, in1, pdst, stride) { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ -} -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - pdst, stride) { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} -#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ -} -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) - -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ -} -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) - -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ -} -#define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ -} - -/* Description : Store 2x4 byte block to destination memory from input vector - Arguments : Inputs - in, stidx, pdst, stride - Details : Index 'stidx' halfword element from 'in' vector is copied to - the GP register and stored to (pdst) - Index 'stidx+1' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + stride) - Index 'stidx+2' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 2 * stride) - Index 'stidx+3' halfword element from 'in' vector is copied to - the GP register and stored to (pdst + 3 * stride) -*/ -#define ST2x4_UB(in, stidx, pdst, stride) { \ - uint16_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ - out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ - out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ - out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ - \ - SH(out0_m, pblk_2x4_m); \ - SH(out1_m, pblk_2x4_m + stride); \ - SH(out2_m, pblk_2x4_m + 2 * stride); \ - SH(out3_m, pblk_2x4_m + 3 * stride); \ -} - -/* Description : Store 4x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 word element from 'in' vector is copied to the GP - register and stored to (pdst) - Index 1 word element from 'in' vector is copied to the GP - register and stored to (pdst + stride) -*/ -#define ST4x2_UB(in, pdst, stride) { \ - uint32_t out0_m, out1_m; \ - uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in, 0); \ - out1_m = __msa_copy_u_w((v4i32)in, 1); \ - \ - SW(out0_m, pblk_4x2_m); \ - SW(out1_m, pblk_4x2_m + stride); \ -} - -/* Description : Store 4x4 byte block to destination memory from input vector - Arguments : Inputs - in0, in1, pdst, stride - Details : 'Idx0' word element from input vector 'in0' is copied to the - GP register and stored to (pdst) - 'Idx1' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + stride) - 'Idx2' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + 2 * stride) - 'Idx3' word element from input vector 'in0' is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ - out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ - out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ - out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ - \ - SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ -} -#define ST4x8_UB(in0, in1, pdst, stride) { \ - uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ - \ - ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ - ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ -} - -/* Description : Store 8x1 byte block to destination memory from input vector - Arguments : Inputs - in, pdst - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) -*/ -#define ST8x1_UB(in, pdst) { \ - uint64_t out0_m; \ - \ - out0_m = 
__msa_copy_u_d((v2i64)in, 0); \ - SD(out0_m, pdst); \ -} - -/* Description : Store 8x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 double word element from 'in' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in' vector is copied to the - GP register and stored to (pdst + stride) -*/ -#define ST8x2_UB(in, pdst, stride) { \ - uint64_t out0_m, out1_m; \ - uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in, 0); \ - out1_m = __msa_copy_u_d((v2i64)in, 1); \ - \ - SD(out0_m, pblk_8x2_m); \ - SD(out1_m, pblk_8x2_m + stride); \ -} - -/* Description : Store 8x4 byte block to destination memory from input - vectors - Arguments : Inputs - in0, in1, pdst, stride - Details : Index 0 double word element from 'in0' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in0' vector is copied to the - GP register and stored to (pdst + stride) - Index 0 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 2 * stride) - Index 1 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 3 * stride) -*/ -#define ST8x4_UB(in0, in1, pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ -} - -/* Description : average with rounding (in0 + in1 + 1) / 2. - Arguments : Inputs - in0, in1, in2, in3, - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned byte element from 'in0' vector is added with - each unsigned byte element from 'in1' vector. Then average - with rounding is calculated and written to 'out0' -*/ -#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ - out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ -} -#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) - -#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ - AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ -} -#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide with zero - Arguments : Inputs - in0, in1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'zero_m' vector are slide into 'in0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ - v16i8 zero_m = { 0 }; \ - out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ -} -#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) - -#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ - out0, out1, out2, out3, slide_val) { \ - SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ - SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ -} -#define SLDI_B4_0_UB(...) 
SLDI_B4_0(v16u8, __VA_ARGS__) - -/* Description : Immediate number of elements to slide - Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by - value specified in the 'slide_val' -*/ -#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ - out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ - out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ -} -#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) - -#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ - out0, out1, out2, slide_val) { \ - SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ - out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ -} -#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) -#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) - -/* Description : Shuffle byte vector elements as per mask vector - Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Byte elements from 'in0' & 'in1' are copied selectively to - 'out0' as per control vector 'mask0' -*/ -#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ - out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ -} -#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) -#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) -#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) - -#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ - out0, out1, out2, out3) { \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ - VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ -} -#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) -#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Unsigned byte elements from 'mult0' are multiplied with - unsigned byte elements from 'cnst0' producing a result - twice the size of input i.e. unsigned halfword. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ - out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ -} -#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) - -#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} -#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) - -/* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. 
- The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ -} -#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) - -#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} -#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. signed word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ -} -#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) - -#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, \ - out0, out1, out2, out3) { \ - DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} -#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) - -/* Description : Dot product of word vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed word elements from 'mult0' are multiplied with - signed word elements from 'cnst0' producing a result - twice the size of input i.e. signed double word. - The multiplication result of adjacent odd-even elements - are added together and written to the 'out0' vector -*/ -#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ - out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ -} -#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) - -/* Description : Dot product & addition of byte vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed byte elements from 'mult0' are multiplied with - signed byte elements from 'cnst0' producing a result - twice the size of input i.e. signed halfword. - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ -} -#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) - -#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ - cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ - DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ - DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ -} -#define DPADD_SB4_SH(...) 
DPADD_SB4(v8i16, __VA_ARGS__) - -/* Description : Dot product & addition of halfword vector elements - Arguments : Inputs - mult0, mult1, cnst0, cnst1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'mult0' are multiplied with - signed halfword elements from 'cnst0' producing a result - twice the size of input i.e. signed word. - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ - out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ -} -#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) - -/* Description : Dot product & addition of double word vector elements - Arguments : Inputs - mult0, mult1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed word element from 'mult0' is multiplied with itself - producing an intermediate result twice the size of input - i.e. signed double word - The multiplication result of adjacent odd-even elements - are added to the 'out0' vector -*/ -#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \ - out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ - out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ -} -#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) - -/* Description : Minimum values between unsigned elements of - either vector are copied to the output vector - Arguments : Inputs - in0, in1, min_vec - Outputs - in place operation - Return Type - as per RTYPE - Details : Minimum of unsigned halfword element values from 'in0' and - 'min_vec' are written to output vector 'in0' -*/ -#define MIN_UH2(RTYPE, in0, in1, min_vec) { \ - in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ - in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ -} -#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) - -#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ - MIN_UH2(RTYPE, in0, in1, min_vec); \ - MIN_UH2(RTYPE, in2, in3, min_vec); \ -} -#define MIN_UH4_UH(...) 
MIN_UH4(v8u16, __VA_ARGS__) - -/* Description : Clips all signed halfword elements of input vector - between 0 & 255 - Arguments : Input - in - Output - out_m - Return Type - signed halfword -*/ -#define CLIP_SH_0_255(in) ({ \ - v8i16 max_m = __msa_ldi_h(255); \ - v8i16 out_m; \ - \ - out_m = __msa_maxi_s_h((v8i16)in, 0); \ - out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ - out_m; \ -}) -#define CLIP_SH2_0_255(in0, in1) { \ - in0 = CLIP_SH_0_255(in0); \ - in1 = CLIP_SH_0_255(in1); \ -} -#define CLIP_SH4_0_255(in0, in1, in2, in3) { \ - CLIP_SH2_0_255(in0, in1); \ - CLIP_SH2_0_255(in2, in3); \ -} - -/* Description : Horizontal addition of 4 signed word elements of input vector - Arguments : Input - in (signed word vector) - Output - sum_m (i32 sum) - Return Type - signed word (GP) - Details : 4 signed word elements of 'in' vector are added together and - the resulting integer sum is returned -*/ -#define HADD_SW_S32(in) ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ -}) - -/* Description : Horizontal addition of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is added to - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ -} -#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) - -#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ - HADD_UB2(RTYPE, in0, in1, out0, out1); \ - HADD_UB2(RTYPE, in2, in3, out2, out3); \ -} -#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) - -/* Description : Horizontal subtraction of unsigned byte vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is subtracted from - even unsigned byte element from 'in0' (pairwise) and the - halfword result is written to 'out0' -*/ -#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ - out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ -} -#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) - -/* Description : Horizontal subtraction of signed halfword vector elements - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Each signed odd halfword element from 'in0' is subtracted from - even signed halfword element from 'in0' (pairwise) and the - word result is written to 'out0' -*/ -#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ - out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ -} -#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) - -/* Description : Set element n input vector to GPR value - Arguments : Inputs - in0, in1, in2, in3 - Output - out - Return Type - as per RTYPE - Details : Set element 0 in vector 'out' to value specified in 'in0' -*/ -#define INSERT_W2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ -} -#define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) - -#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ - out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ - out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ -} -#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) -#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) - -#define INSERT_D2(RTYPE, in0, in1, out) { \ - out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ - out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ -} -#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) -#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) - -/* Description : Interleave even byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ - out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ -} -#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) -#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) - -/* Description : Interleave even halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ - out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ -} -#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) -#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) -#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) - -/* Description : Interleave even word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ - out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ -} -#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) - -/* Description : Interleave even double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double word elements of 'in0' and 'in1' are interleaved - and written to 'out0' -*/ -#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ - out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ -} -#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) - -/* Description : Interleave left half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of byte elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ -} -#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) -#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) -#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) -#define ILVL_B2_SH(...) 
ILVL_B2(v8i16, __VA_ARGS__) - -#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) -#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) - -/* Description : Interleave left half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ -} -#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) - -/* Description : Interleave left half of word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Left half of word elements of 'in0' and 'in1' are interleaved - and written to 'out0'. -*/ -#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ -} -#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) -#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) - -/* Description : Interleave right half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements of 'in0' and 'in1' are interleaved - and written to out0. -*/ -#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ -} -#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) -#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) -#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) -#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) - -#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) -#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) -#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) -#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) - -#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3); \ - ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ - out4, out5, out6, out7); \ -} -#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) - -/* Description : Interleave right half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of halfword elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ -} -#define ILVR_H2_SH(...) 
ILVR_H2(v8i16, __VA_ARGS__) - -#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) - -#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ -} -#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) -#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) - -#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) - -/* Description : Interleave right half of double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of double word elements of 'in0' and 'in1' are - interleaved and written to 'out0'. -*/ -#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ - out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ -} -#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) -#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) -#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) - -#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ -} -#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) - -#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) -#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) - -/* Description : Interleave both left and right half of input vectors - Arguments : Inputs - in0, in1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and written to 'out0' -*/ -#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ -} -#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) -#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) -#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) -#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) - -#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ -} -#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) -#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) - -#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ - out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ - out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ -} -#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) -#define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range. - The results are written in place -*/ -#define SAT_UH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ -} -#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) - -#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_UH2(RTYPE, in0, in1, sat_val); \ - SAT_UH2(RTYPE, in2, in3, sat_val) \ -} -#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) - -/* Description : Saturate the halfword element values to the max - unsigned value of (sat_val + 1) bits - The element data width remains unchanged - Arguments : Inputs - in0, in1, sat_val - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val + 1) bit range - The results are written in place -*/ -#define SAT_SH2(RTYPE, in0, in1, sat_val) { \ - in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ - in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ -} -#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) - -#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ - SAT_SH2(RTYPE, in0, in1, sat_val); \ - SAT_SH2(RTYPE, in2, in3, sat_val); \ -} -#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) - -/* Description : Indexed halfword element values are replicated to all - elements in output vector - Arguments : Inputs - in, idx0, idx1 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : 'idx0' element value from 'in' vector is replicated to all - elements in 'out0' vector - Valid index range for halfword operation is 0-7 -*/ -#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ - out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ - out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ -} -#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) - -#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ - out0, out1, out2, out3) { \ - SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ - SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ -} -#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) -#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even byte elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even byte elements of 'in0' are copied to the left half of - 'out0' & even byte elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ -} -#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) -#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) -#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) - -#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) -#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) -#define PCKEV_B4_SH(...) 
PCKEV_B4(v8i16, __VA_ARGS__) - -/* Description : Pack even halfword elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even halfword elements of 'in0' are copied to the left half of - 'out0' & even halfword elements of 'in1' are copied to the - right half of 'out0'. -*/ -#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ -} -#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) -#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) - -#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) - -/* Description : Pack even double word elements of vector pairs - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Even double elements of 'in0' are copied to the left half of - 'out0' & even double elements of 'in1' are copied to the right - half of 'out0'. -*/ -#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ - out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ -} -#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) -#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) - -#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ - PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) - -/* Description : Each byte element is logically xor'ed with immediate 128 - Arguments : Inputs - in0, in1 - Outputs - in place operation - Return Type - as per RTYPE - Details : Each unsigned byte element from input vector 'in0' is - logically xor'ed with 128 and the result is stored in-place. -*/ -#define XORI_B2_128(RTYPE, in0, in1) { \ - in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ - in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ -} -#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) -#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) - -#define XORI_B3_128(RTYPE, in0, in1, in2) { \ - XORI_B2_128(RTYPE, in0, in1); \ - in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ -} -#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) - -#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ - XORI_B2_128(RTYPE, in0, in1); \ - XORI_B2_128(RTYPE, in2, in3); \ -} -#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) -#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) - -#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ - XORI_B4_128(RTYPE, in0, in1, in2, in3); \ - XORI_B3_128(RTYPE, in4, in5, in6); \ -} -#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) - -/* Description : Average of signed halfword elements -> (a + b) / 2 - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each signed halfword element from 'in0' is added to each - signed halfword element of 'in1' with full precision resulting - in one extra bit in the result. 
The result is then divided by - 2 and written to 'out0' -*/ -#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ - out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ - out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ -} -#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) - -/* Description : Addition of signed halfword elements and signed saturation - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Return Type - as per RTYPE - Details : Signed halfword elements from 'in0' are added to signed - halfword elements of 'in1'. The result is then signed saturated - between halfword data type range -*/ -#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ - out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ - out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ -} -#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) - -#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ - ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ -} -#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) - -/* Description : Shift left all elements of vector (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is left shifted by 'shift' and - the result is written in-place. -*/ -#define SLLI_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 << shift; \ - in1 = in1 << shift; \ - in2 = in2 << shift; \ - in3 = in3 << shift; \ -} - -/* Description : Arithmetic shift right all elements of vector - (generic for all data types) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in place operation - Return Type - as per input vector RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is a GP variable. -*/ -#define SRA_4V(in0, in1, in2, in3, shift) { \ - in0 = in0 >> shift; \ - in1 = in1 >> shift; \ - in2 = in2 >> shift; \ - in3 = in3 >> shift; \ -} - -/* Description : Shift right arithmetic rounded words - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the number of bits in the corresponding element in the vector - 'shift'. The last discarded bit is added to shifted value for - rounding and the result is written in-place. - 'shift' is a vector. -*/ -#define SRAR_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ - in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ -} - -#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRAR_W2(RTYPE, in0, in1, shift) \ - SRAR_W2(RTYPE, in2, in3, shift) \ -} -#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) - -/* Description : Shift right arithmetic rounded (immediate) - Arguments : Inputs - in0, in1, shift - Outputs - in place operation - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetically by - the value in 'shift'. The last discarded bit is added to the - shifted value for rounding and the result is written in-place. - 'shift' is an immediate value. 
-*/ -#define SRARI_H2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ - in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ -} -#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) -#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) - -#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_H2(RTYPE, in0, in1, shift); \ - SRARI_H2(RTYPE, in2, in3, shift); \ -} -#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) -#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) - -#define SRARI_W2(RTYPE, in0, in1, shift) { \ - in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ - in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ -} -#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) - -#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ - SRARI_W2(RTYPE, in0, in1, shift); \ - SRARI_W2(RTYPE, in2, in3, shift); \ -} -#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) - -/* Description : Logical shift right all elements of vector (immediate) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - out0, out1, out2, out3 - Return Type - as per RTYPE - Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is written in-place. 'shift' is an immediate value. -*/ -#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \ - out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ - out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ - out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ - out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ -} -#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) - -/* Description : Multiplication of pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element from 'in0' is multiplied with elements from 'in1' - and the result is written to 'out0' -*/ -#define MUL2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 * in1; \ - out1 = in2 * in3; \ -} -#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - MUL2(in0, in1, in2, in3, out0, out1); \ - MUL2(in4, in5, in6, in7, out2, out3); \ -} - -/* Description : Addition of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in0' is added to 'in1' and result is written - to 'out0'. -*/ -#define ADD2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 + in1; \ - out1 = in2 + in3; \ -} -#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - ADD2(in0, in1, in2, in3, out0, out1); \ - ADD2(in4, in5, in6, in7, out2, out3); \ -} - -/* Description : Subtraction of 2 pairs of vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1 - Details : Each element in 'in1' is subtracted from 'in0' and result is - written to 'out0'. 
-*/ -#define SUB2(in0, in1, in2, in3, out0, out1) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ -} -#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3) { \ - out0 = in0 - in1; \ - out1 = in2 - in3; \ - out2 = in4 - in5; \ - out3 = in6 - in7; \ -} - -/* Description : Sign extend halfword elements from right half of the vector - Arguments : Input - in (halfword vector) - Output - out (sign extended word vector) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved with same vector 'in0' to generate - 4 word elements keeping sign intact -*/ -#define UNPCK_R_SH_SW(in, out) { \ - v8i16 sign_m; \ - \ - sign_m = __msa_clti_s_h((v8i16)in, 0); \ - out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ -} - -/* Description : Zero extend unsigned byte elements to halfword elements - Arguments : Input - in (unsigned byte vector) - Outputs - out0, out1 (unsigned halfword vectors) - Return Type - signed halfword - Details : Zero extended right half of vector is returned in 'out0' - Zero extended left half of vector is returned in 'out1' -*/ -#define UNPCK_UB_SH(in, out0, out1) { \ - v16i8 zero_m = { 0 }; \ - \ - ILVRL_B2_SH(zero_m, in, out0, out1); \ -} - -/* Description : Sign extend halfword elements from input vector and return - the result in pair of vectors - Arguments : Input - in (halfword vector) - Outputs - out0, out1 (sign extended word vectors) - Return Type - signed word - Details : Sign bit of halfword elements from input vector 'in' is - extracted and interleaved right with same vector 'in0' to - generate 4 signed word elements in 'out0' - Then interleaved left with same vector 'in0' to - generate 4 signed word elements in 'out1' -*/ -#define UNPCK_SH_SW(in, out0, out1) { \ - v8i16 tmp_m; \ - \ - tmp_m = __msa_clti_s_h((v8i16)in, 0); \ - ILVRL_H2_SW(tmp_m, in, out0, out1); \ -} - -/* Description : Butterfly of 4 input vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Details : Butterfly operation -*/ -#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ - out0 = in0 + in3; \ - out1 = in1 + in2; \ - \ - out2 = in1 - in2; \ - out3 = in0 - in3; \ -} - -/* Description : Butterfly of 8 input vectors - Arguments : Inputs - in0 ... in7 - Outputs - out0 .. out7 - Details : Butterfly operation -*/ -#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - out0 = in0 + in7; \ - out1 = in1 + in6; \ - out2 = in2 + in5; \ - out3 = in3 + in4; \ - \ - out4 = in3 - in4; \ - out5 = in2 - in5; \ - out6 = in1 - in6; \ - out7 = in0 - in7; \ -} - -/* Description : Butterfly of 16 input vectors - Arguments : Inputs - in0 ... in15 - Outputs - out0 .. 
out15 - Details : Butterfly operation -*/ -#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, out15) { \ - out0 = in0 + in15; \ - out1 = in1 + in14; \ - out2 = in2 + in13; \ - out3 = in3 + in12; \ - out4 = in4 + in11; \ - out5 = in5 + in10; \ - out6 = in6 + in9; \ - out7 = in7 + in8; \ - \ - out8 = in7 - in8; \ - out9 = in6 - in9; \ - out10 = in5 - in10; \ - out11 = in4 - in11; \ - out12 = in3 - in12; \ - out13 = in2 - in13; \ - out14 = in1 - in14; \ - out15 = in0 - in15; \ -} - -/* Description : Transpose input 8x8 byte block - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ - ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ - ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ - ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ - SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ - SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ -} -#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) - -/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, - in8, in9, in10, in11, in12, in13, in14, in15 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - unsigned byte -*/ -#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ - in8, in9, in10, in11, in12, in13, in14, in15, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ - ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ - ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ - ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ - \ - tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ - tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ - tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ - tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ - out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ - tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ - out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ - tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ - \ - ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ - out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ - out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ - out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m =
(v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ - out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ - out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ -} - -/* Description : Transpose 4x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed halfword -*/ -#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 s0_m, s1_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ - ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ -} - -/* Description : Transpose 4x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ - v8i16 zero_m = { 0 }; \ - \ - ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ - tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ - ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ - ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ - \ - out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ - out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ - \ - out4 = zero_m; \ - out5 = zero_m; \ - out6 = zero_m; \ - out7 = zero_m; \ -} - -/* Description : Transpose 8x4 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword -*/ -#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ - ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ - ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ -} - -/* Description : Transpose 8x8 block with half word elements in vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - as per RTYPE -*/ -#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ - out0, out1, out2, out3, out4, out5, out6, out7) { \ - v8i16 s0_m, s1_m; \ - v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ - \ - ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ - ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ - ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ - ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ - ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ - PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ - tmp3_m, tmp7_m, out0, out2, out4, out6); \ - out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ - out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ - out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ - out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ -} -#define 
TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) - -/* Description : Transpose 4x4 block with word elements in vectors - Arguments : Inputs - in0, in1, in2, in3 - Outputs - out0, out1, out2, out3 - Return Type - signed word -*/ -#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ - ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ - \ - out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ - out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ - out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ - out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ -} - -/* Description : Add block 4x4 - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Least significant 4 bytes from each input vector are added to - the destination bytes, clipped between 0-255 and stored. -*/ -#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ - uint32_t src0_m, src1_m, src2_m, src3_m; \ - v8i16 inp0_m, inp1_m, res0_m, res1_m; \ - v16i8 dst0_m = { 0 }; \ - v16i8 dst1_m = { 0 }; \ - v16i8 zero_m = { 0 }; \ - \ - ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ - LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ - INSERT_W2_SB(src0_m, src1_m, dst0_m); \ - INSERT_W2_SB(src2_m, src3_m, dst1_m); \ - ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ - ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ - CLIP_SH2_0_255(res0_m, res1_m); \ - PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ -} - -/* Description : Pack even elements of input vectors & xor with 128 - Arguments : Inputs - in0, in1 - Output - out_m - Return Type - unsigned byte - Details : Signed byte even elements from 'in0' and 'in1' are packed - together in one vector and the resulting vector is xor'ed with - 128 to shift the range from signed to unsigned byte -*/ -#define PCKEV_XORI128_UB(in0, in1) ({ \ - v16u8 out_m; \ - \ - out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ - out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ - out_m; \ -}) - -/* Description : Converts inputs to unsigned bytes, interleave, average & store - as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride -*/ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ - dst0, dst1, dst2, dst3, pdst, stride) { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ -} - -/* Description : Pack even byte elements and store byte vector in destination - memory - Arguments : Inputs - in0, in1, pdst -*/ -#define PCKEV_ST_SB(in0, in1, pdst) { \ - v16i8 tmp_m; \ - \ - tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ - ST_SB(tmp_m, (pdst)); \ -} - -/* Description : Horizontal 2 tap filter kernel code - Arguments : Inputs - in0, in1, mask, coeff, shift -*/ -#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ - v16i8 tmp0_m; \ - v8u16 tmp1_m; \ - \ - tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ - tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ - tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ - tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ - \ - tmp1_m; \ -}) -#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */ diff 
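For readers following the macro descriptions in the header removed above, a minimal scalar sketch of the arithmetic they vectorize may help. HORIZ_2TAP_FILT_UH is described as a shuffle, an unsigned dot product with a coefficient pair, a rounded arithmetic right shift, and an unsigned saturation; AVER_UB2 is described as a rounding average (in0 + in1 + 1) / 2. The C below is illustrative only and not part of this patch: the helper names, the 7-bit filter precision, and the {96, 32} coefficient pair are assumptions chosen so the two taps sum to 128.

#include <stdint.h>

#define BILINEAR_SHIFT 7  /* assumed 2-tap filter precision: taps sum to 1 << 7 */

/* Scalar equivalent of the 2-tap horizontal filter step described for
 * HORIZ_2TAP_FILT_UH: dot product of a pixel pair with a coefficient pair,
 * shift-right-arithmetic rounded, then saturated to 8 bits. */
uint8_t two_tap_filter(uint8_t p0, uint8_t p1, int c0, int c1) {
  int sum = p0 * c0 + p1 * c1;
  sum = (sum + (1 << (BILINEAR_SHIFT - 1))) >> BILINEAR_SHIFT;
  return (uint8_t)(sum > 255 ? 255 : sum);
}

/* Scalar equivalent of the AVER_UB* rounding average: (a + b + 1) / 2. */
uint8_t aver_ub(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

For example, a quarter-pel offset could use two_tap_filter(p0, p1, 96, 32), weighting the first pixel by three quarters and the second by one quarter before the rounding shift.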
--git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c index 64cb9a818..7257cd629 100644 --- a/vp9/common/mips/msa/vp9_mfqe_msa.c +++ b/vp9/common/mips/msa/vp9_mfqe_msa.c @@ -10,7 +10,7 @@ #include "./vp9_rtcd.h" #include "vp9/common/vp9_onyxc_int.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, int32_t dst_stride, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ec7da5c0c..497f7e7c5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -802,88 +802,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - -# variance -add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; - -# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form -add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t 
*second_pred"; -specialize qw/vp9_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; - -add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc"; -#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt - -add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; -specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; - add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p"; specialize qw/vp9_avg_8x8 sse2 neon msa/; @@ -1085,241 +1003,6 @@ specialize qw/vp9_temporal_filter_apply sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int 
vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned 
int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance4x4/; - - add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance4x4/; - - add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int 
vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance4x4/; - - add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/; - - # ENCODEMB INVOKE add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vp9/encoder/mips/msa/vp9_avg_msa.c index f2e8b275a..611adb1a2 100644 --- a/vp9/encoder/mips/msa/vp9_avg_msa.c +++ b/vp9/encoder/mips/msa/vp9_avg_msa.c @@ -9,7 +9,7 @@ */ #include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { uint32_t sum_out; diff --git a/vp9/encoder/mips/msa/vp9_error_msa.c b/vp9/encoder/mips/msa/vp9_error_msa.c index 9709092fc..1dc70bd82 100644 --- a/vp9/encoder/mips/msa/vp9_error_msa.c +++ b/vp9/encoder/mips/msa/vp9_error_msa.c @@ -9,7 +9,7 @@ */ #include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" #define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \ static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr, \ diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h 
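A note on the vp9_rtcd_defs.pl removals above: each add_proto/specialize pair is input to the rtcd.pl generator, which emits one dispatch function pointer per prototype plus CPU-feature setup code, and the "$sse2_x86inc" token only registers the SSE2 variant when the x86inc assembly path is enabled in the build. The sketch below is a simplified, self-contained illustration of that generated pattern, not the literal generator output; all names and the stub bodies are placeholders.

#include <stdint.h>
#include <stdio.h>

typedef unsigned int (*subpix_var_fn)(const uint8_t *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);

/* Placeholder C and SSE2 bodies; the real ones do the filtering and variance. */
static unsigned int subpix_var16x16_c(const uint8_t *src, int src_stride,
                                      int xoffset, int yoffset,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse) {
  (void)src; (void)src_stride; (void)xoffset; (void)yoffset;
  (void)ref; (void)ref_stride;
  *sse = 0;
  return 0;
}

static unsigned int subpix_var16x16_sse2(const uint8_t *src, int src_stride,
                                         int xoffset, int yoffset,
                                         const uint8_t *ref, int ref_stride,
                                         unsigned int *sse) {
  return subpix_var16x16_c(src, src_stride, xoffset, yoffset,
                           ref, ref_stride, sse);
}

/* add_proto: one dispatch pointer per prototype, defaulting to the C version. */
static subpix_var_fn vpx_sub_pixel_variance16x16 = subpix_var16x16_c;

/* specialize: the generated setup routine overrides the pointer when the
 * listed instruction set is detected at run time. */
static void setup_rtcd(int have_sse2) {
  if (have_sse2) vpx_sub_pixel_variance16x16 = subpix_var16x16_sse2;
}

int main(void) {
  uint8_t a[16 * 17] = { 0 }, b[16 * 16] = { 0 };
  unsigned int sse;
  setup_rtcd(1);
  printf("%u\n", vpx_sub_pixel_variance16x16(a, 16, 4, 4, b, 16, &sse));
  return 0;
}

After this patch the same prototypes are registered under vpx_ names in vpx_dsp_rtcd_defs.pl, so callers resolve vpx_sub_pixel_variance* and vpx_highbd_*_sub_pixel_variance* instead of the vp9_ entries deleted here.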
index ad66576b6..d111421aa 100644 --- a/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -13,7 +13,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_idct.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) { \ v8i16 k0_m = __msa_fill_h(cnst0); \ diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c index 4053bffae..363aabb7c 100644 --- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c +++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c @@ -9,7 +9,7 @@ */ #include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride, diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 721cd81bf..f27f57a24 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1023,8 +1023,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vp9_highbd_sub_pixel_variance32x16, - vp9_highbd_sub_pixel_avg_variance32x16, + vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, vpx_highbd_sad32x16x4d_bits8) @@ -1033,8 +1033,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vp9_highbd_sub_pixel_variance16x32, - vp9_highbd_sub_pixel_avg_variance16x32, + vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, vpx_highbd_sad16x32x4d_bits8) @@ -1043,8 +1043,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vp9_highbd_sub_pixel_variance64x32, - vp9_highbd_sub_pixel_avg_variance64x32, + vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, vpx_highbd_sad64x32x4d_bits8) @@ -1053,8 +1053,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vp9_highbd_sub_pixel_variance32x64, - vp9_highbd_sub_pixel_avg_variance32x64, + vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, vpx_highbd_sad32x64x4d_bits8) @@ -1063,8 +1063,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vp9_highbd_sub_pixel_variance32x32, - vp9_highbd_sub_pixel_avg_variance32x32, + vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, vpx_highbd_sad32x32x4d_bits8) @@ -1073,8 +1073,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, - vp9_highbd_sub_pixel_variance64x64, - vp9_highbd_sub_pixel_avg_variance64x64, + vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, vpx_highbd_sad64x64x4d_bits8) @@ -1083,8 +1083,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vp9_highbd_sub_pixel_variance16x16, - vp9_highbd_sub_pixel_avg_variance16x16, 
+ vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, vpx_highbd_sad16x16x4d_bits8) @@ -1093,8 +1093,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vp9_highbd_sub_pixel_variance16x8, - vp9_highbd_sub_pixel_avg_variance16x8, + vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) @@ -1103,8 +1103,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vp9_highbd_sub_pixel_variance8x16, - vp9_highbd_sub_pixel_avg_variance8x16, + vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) @@ -1113,8 +1113,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, vpx_highbd_8_variance8x8, - vp9_highbd_sub_pixel_variance8x8, - vp9_highbd_sub_pixel_avg_variance8x8, + vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) @@ -1123,8 +1123,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vp9_highbd_sub_pixel_variance8x4, - vp9_highbd_sub_pixel_avg_variance8x4, + vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) @@ -1133,8 +1133,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vp9_highbd_sub_pixel_variance4x8, - vp9_highbd_sub_pixel_avg_variance4x8, + vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) @@ -1143,8 +1143,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, vpx_highbd_8_variance4x4, - vp9_highbd_sub_pixel_variance4x4, - vp9_highbd_sub_pixel_avg_variance4x4, + vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) @@ -1155,8 +1155,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vp9_highbd_10_sub_pixel_variance32x16, - vp9_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, vpx_highbd_sad32x16x4d_bits10) @@ -1165,8 +1165,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vp9_highbd_10_sub_pixel_variance16x32, - vp9_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, vpx_highbd_sad16x32x4d_bits10) @@ -1175,8 +1175,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vp9_highbd_10_sub_pixel_variance64x32, - vp9_highbd_10_sub_pixel_avg_variance64x32, + 
vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, vpx_highbd_sad64x32x4d_bits10) @@ -1185,8 +1185,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vp9_highbd_10_sub_pixel_variance32x64, - vp9_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, vpx_highbd_sad32x64x4d_bits10) @@ -1195,8 +1195,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vp9_highbd_10_sub_pixel_variance32x32, - vp9_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, vpx_highbd_sad32x32x4d_bits10) @@ -1205,8 +1205,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vp9_highbd_10_sub_pixel_variance64x64, - vp9_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, vpx_highbd_sad64x64x4d_bits10) @@ -1215,8 +1215,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vp9_highbd_10_sub_pixel_variance16x16, - vp9_highbd_10_sub_pixel_avg_variance16x16, + vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, vpx_highbd_sad16x16x4d_bits10) @@ -1225,8 +1225,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vp9_highbd_10_sub_pixel_variance16x8, - vp9_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, vpx_highbd_sad16x8x4d_bits10) @@ -1235,8 +1235,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vp9_highbd_10_sub_pixel_variance8x16, - vp9_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, vpx_highbd_sad8x16x4d_bits10) @@ -1245,8 +1245,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vp9_highbd_10_sub_pixel_variance8x8, - vp9_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) @@ -1255,8 +1255,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vp9_highbd_10_sub_pixel_variance8x4, - vp9_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) @@ -1265,8 +1265,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, 
vpx_highbd_10_variance4x8, - vp9_highbd_10_sub_pixel_variance4x8, - vp9_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) @@ -1275,8 +1275,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vp9_highbd_10_sub_pixel_variance4x4, - vp9_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) @@ -1287,8 +1287,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vp9_highbd_12_sub_pixel_variance32x16, - vp9_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, vpx_highbd_sad32x16x4d_bits12) @@ -1297,8 +1297,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vp9_highbd_12_sub_pixel_variance16x32, - vp9_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, vpx_highbd_sad16x32x4d_bits12) @@ -1307,8 +1307,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vp9_highbd_12_sub_pixel_variance64x32, - vp9_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, vpx_highbd_sad64x32x4d_bits12) @@ -1317,8 +1317,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vp9_highbd_12_sub_pixel_variance32x64, - vp9_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, vpx_highbd_sad32x64x4d_bits12) @@ -1327,8 +1327,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vp9_highbd_12_sub_pixel_variance32x32, - vp9_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, vpx_highbd_sad32x32x4d_bits12) @@ -1337,8 +1337,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vp9_highbd_12_sub_pixel_variance64x64, - vp9_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, vpx_highbd_sad64x64x4d_bits12) @@ -1347,8 +1347,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vp9_highbd_12_sub_pixel_variance16x16, - vp9_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, vpx_highbd_sad16x16x4d_bits12) @@ -1357,8 +1357,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { 
vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vp9_highbd_12_sub_pixel_variance16x8, - vp9_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, vpx_highbd_sad16x8x4d_bits12) @@ -1367,8 +1367,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vp9_highbd_12_sub_pixel_variance8x16, - vp9_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, vpx_highbd_sad8x16x4d_bits12) @@ -1377,8 +1377,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vp9_highbd_12_sub_pixel_variance8x8, - vp9_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) @@ -1387,8 +1387,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vp9_highbd_12_sub_pixel_variance8x4, - vp9_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) @@ -1397,8 +1397,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vp9_highbd_12_sub_pixel_variance4x8, - vp9_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) @@ -1407,8 +1407,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vp9_highbd_12_sub_pixel_variance4x4, - vp9_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) @@ -1832,62 +1832,62 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, - vpx_variance32x16, vp9_sub_pixel_variance32x16, - vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, - vpx_variance16x32, vp9_sub_pixel_variance16x32, - vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, - vpx_variance64x32, vp9_sub_pixel_variance64x32, - vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, - vpx_variance32x64, vp9_sub_pixel_variance32x64, - vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d) + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, NULL, NULL, 
vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, - vpx_variance32x32, vp9_sub_pixel_variance32x32, - vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, - vpx_variance64x64, vp9_sub_pixel_variance64x64, - vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, - vpx_variance16x16, vp9_sub_pixel_variance16x16, - vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, - vpx_variance16x8, vp9_sub_pixel_variance16x8, - vp9_sub_pixel_avg_variance16x8, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, - vpx_variance8x16, vp9_sub_pixel_variance8x16, - vp9_sub_pixel_avg_variance8x16, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, - vpx_variance8x8, vp9_sub_pixel_variance8x8, - vp9_sub_pixel_avg_variance8x8, + vpx_variance8x8, vpx_sub_pixel_variance8x8, + vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, - vpx_variance8x4, vp9_sub_pixel_variance8x4, - vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) + vpx_variance8x4, vpx_sub_pixel_variance8x4, + vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, - vpx_variance4x8, vp9_sub_pixel_variance4x8, - vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) + vpx_variance4x8, vpx_sub_pixel_variance4x8, + vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, - vpx_variance4x4, vp9_sub_pixel_variance4x4, - vp9_sub_pixel_avg_variance4x4, + vpx_variance4x4, vpx_sub_pixel_variance4x4, + vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 10ad2d391..f095cada2 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -40,7 +40,7 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" -#include "vp9/encoder/vp9_variance.h" +#include "vpx_dsp/variance.h" #if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e9ef29488..a85b70b99 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -35,7 +35,7 @@ #include "vp9/encoder/vp9_mcomp.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/encoder/vp9_rd.h" -#include "vp9/encoder/vp9_variance.h" +#include "vpx_dsp/variance.h" #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 @@ -298,7 +298,7 @@ void vp9_end_first_pass(VP9_COMP *cpi) { } } -static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { +static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) { 
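For context on the BFP() edits above: vp9_create_compressor fills one vp9_variance_fn_ptr_t entry per block size, and this patch only changes which library supplies the sub-pixel members (vpx_dsp instead of the deleted vp9_variance.c); the table layout is untouched. Below is a cut-down sketch of that table-filling pattern. The struct is trimmed to two members and the implementations are stubs, but the field names (vf, svf) and the macro style follow the vtable these hunks populate.

#include <stdint.h>

typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          unsigned int *sse);
typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *src,
                                                int src_stride,
                                                int xoffset, int yoffset,
                                                const uint8_t *ref,
                                                int ref_stride,
                                                unsigned int *sse);

/* Trimmed vtable; the real vp9_variance_fn_ptr_t also carries SAD members. */
typedef struct {
  vpx_variance_fn_t vf;
  vpx_subpixvariance_fn_t svf;
} variance_vtable;

enum { BLOCK_8X8 = 0, BLOCK_16X16, BLOCK_SIZES };

static unsigned int var_stub(const uint8_t *a, int as,
                             const uint8_t *b, int bs, unsigned int *sse) {
  (void)a; (void)as; (void)b; (void)bs;
  *sse = 0;
  return 0;
}

static unsigned int subpix_var_stub(const uint8_t *a, int as, int xo, int yo,
                                    const uint8_t *b, int bs,
                                    unsigned int *sse) {
  (void)xo; (void)yo;
  return var_stub(a, as, b, bs, sse);
}

/* BFP-style filler: one invocation per block size wires every member. */
#define BFP(table, BT, VF, SVF) \
  do {                          \
    (table)[BT].vf = (VF);      \
    (table)[BT].svf = (SVF);    \
  } while (0)

int main(void) {
  static uint8_t a[16 * 16], b[16 * 16];
  variance_vtable fn_ptr[BLOCK_SIZES];
  unsigned int sse;
  BFP(fn_ptr, BLOCK_8X8, var_stub, subpix_var_stub);
  BFP(fn_ptr, BLOCK_16X16, var_stub, subpix_var_stub);
  return (int)fn_ptr[BLOCK_16X16].svf(a, 16, 4, 4, b, 16, &sse);
}

The HIGHBD_BFP blocks earlier in this file follow the same pattern, simply filled three times with the 8-, 10- and 12-bit entry points.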
switch (bsize) { case BLOCK_8X8: return vpx_mse8x8; @@ -315,13 +315,13 @@ static unsigned int get_prediction_error(BLOCK_SIZE bsize, const struct buf_2d *src, const struct buf_2d *ref) { unsigned int sse; - const vp9_variance_fn_t fn = get_block_variance_fn(bsize); + const vpx_variance_fn_t fn = get_block_variance_fn(bsize); fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, +static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, int bd) { switch (bd) { default: @@ -368,7 +368,7 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize, const struct buf_2d *ref, int bd) { unsigned int sse; - const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); + const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd); fn(src->buf, src->stride, ref->buf, ref->stride, &sse); return sse; } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 99c1afa28..817bd7959 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -13,7 +13,7 @@ #define VP9_ENCODER_VP9_MCOMP_H_ #include "vp9/encoder/vp9_block.h" -#include "vp9/encoder/vp9_variance.h" +#include "vpx_dsp/variance.h" #ifdef __cplusplus extern "C" { diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 930561aad..bc7cb34ea 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -37,7 +37,6 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" -#include "vp9/encoder/vp9_variance.h" #define RD_THRESH_POW 1.25 #define RD_MULT_EPB_RATIO 64 diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 697667935..3a27e8989 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -39,7 +39,6 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_variance.h" #include "vp9/encoder/vp9_aq_variance.h" #define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \ diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c deleted file mode 100644 index 6f26996df..000000000 --- a/vp9/encoder/vp9_variance.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" - -#include "vpx_ports/mem.h" -#include "vpx/vpx_integer.h" - -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_filter.h" - -#include "vp9/encoder/vp9_variance.h" - -static const uint8_t bilinear_filters[8][2] = { - { 128, 0, }, - { 112, 16, }, - { 96, 32, }, - { 80, 48, }, - { 64, 64, }, - { 48, 80, }, - { 32, 96, }, - { 16, 112, }, -}; - -// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal -// or vertical direction to produce the filtered output block. Used to implement -// first-pass of 2-D separable filter. -// -// Produces int32_t output to retain precision for next pass. Two filter taps -// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is -// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). 
It -// defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal -// or vertical direction to produce the filtered output block. Used to implement -// second-pass of 2-D separable filter. -// -// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two -// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the -// filter is applied horizontally (pixel_step=1) or vertically (pixel_step= -// stride). It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#define SUBPIX_VAR(W, H) \ -unsigned int vp9_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ -\ - var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \ -} - -#define SUBPIX_AVG_VAR(W, H) \ -unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ -\ - var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ -\ - return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \ -} - -SUBPIX_VAR(4, 4) -SUBPIX_AVG_VAR(4, 4) - -SUBPIX_VAR(4, 8) -SUBPIX_AVG_VAR(4, 8) - -SUBPIX_VAR(8, 4) -SUBPIX_AVG_VAR(8, 4) - -SUBPIX_VAR(8, 8) -SUBPIX_AVG_VAR(8, 8) - -SUBPIX_VAR(8, 16) -SUBPIX_AVG_VAR(8, 16) - -SUBPIX_VAR(16, 8) -SUBPIX_AVG_VAR(16, 8) - -SUBPIX_VAR(16, 16) -SUBPIX_AVG_VAR(16, 16) - -SUBPIX_VAR(16, 32) -SUBPIX_AVG_VAR(16, 32) - -SUBPIX_VAR(32, 16) -SUBPIX_AVG_VAR(32, 16) - -SUBPIX_VAR(32, 32) -SUBPIX_AVG_VAR(32, 32) - -SUBPIX_VAR(32, 64) -SUBPIX_AVG_VAR(32, 64) - 
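Every SUBPIX_VAR/SUBPIX_AVG_VAR instantiation being deleted here (their vpx_ equivalents land in vpx_dsp/variance.c) reduces to the same recipe: a horizontal 2-tap bilinear pass into an (H+1)xW uint16_t scratch buffer, a vertical pass back down to uint8_t, then a plain variance against the reference. The sketch below writes that recipe out for a single 8x8 block, without the macros, under the assumption that FILTER_BITS is 7 (the taps in the bilinear_filters table above sum to 128); the function names are illustrative, not library entry points.

#include <stdint.h>

#define FILTER_BITS 7  /* assumed: taps sum to 128 = 1 << FILTER_BITS */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

/* variance = SSE - sum^2 / (8 * 8) */
static unsigned int variance8x8(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                unsigned int *sse) {
  int sum = 0;
  int i, j;
  *sse = 0;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      const int diff = a[i * a_stride + j] - b[i * b_stride + j];
      sum += diff;
      *sse += diff * diff;
    }
  }
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}

unsigned int subpel_variance8x8(const uint8_t *src, int src_stride,
                                int xoffset, int yoffset,
                                const uint8_t *dst, int dst_stride,
                                unsigned int *sse) {
  uint16_t fdata3[(8 + 1) * 8];  /* first pass keeps one extra row */
  uint8_t temp2[8 * 8];
  int i, j;

  /* First (horizontal) pass: pixel_step = 1, filtering src and src + 1. */
  for (i = 0; i < 9; ++i)
    for (j = 0; j < 8; ++j)
      fdata3[i * 8 + j] = ROUND_POWER_OF_TWO(
          src[i * src_stride + j] * bilinear_filters[xoffset][0] +
          src[i * src_stride + j + 1] * bilinear_filters[xoffset][1],
          FILTER_BITS);

  /* Second (vertical) pass: pixel_step = 8, the intermediate row stride. */
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      temp2[i * 8 + j] = ROUND_POWER_OF_TWO(
          fdata3[i * 8 + j] * bilinear_filters[yoffset][0] +
          fdata3[(i + 1) * 8 + j] * bilinear_filters[yoffset][1],
          FILTER_BITS);

  return variance8x8(temp2, 8, dst, dst_stride, sse);
}

The avg variants deleted alongside differ only in blending the filtered block with second_pred via vpx_comp_avg_pred before the final variance call, and the highbd variants that follow apply the same two passes to uint16_t pixels.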
-SUBPIX_VAR(64, 32) -SUBPIX_AVG_VAR(64, 32) - -SUBPIX_VAR(64, 64) -SUBPIX_AVG_VAR(64, 64) - -#if CONFIG_VP9_HIGHBITDEPTH -static void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *vp9_filter) { - unsigned int i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -static void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, - uint16_t *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = - ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] + - (int)src_ptr[pixel_step] * vp9_filter[1], - FILTER_BITS); - src_ptr++; - } - - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -#define HIGHBD_SUBPIX_VAR(W, H) \ -unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ - dst_stride, sse); \ -} \ -\ -unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} \ -\ -unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ - W, dst, dst_stride, sse); \ -} - -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ -unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - 
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ -} \ -\ -unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} \ -\ -unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, \ - int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, \ - unsigned int *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ -\ - highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ -\ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ -\ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ - W, dst, dst_stride, sse); \ -} - -HIGHBD_SUBPIX_VAR(4, 4) -HIGHBD_SUBPIX_AVG_VAR(4, 4) - -HIGHBD_SUBPIX_VAR(4, 8) -HIGHBD_SUBPIX_AVG_VAR(4, 8) - -HIGHBD_SUBPIX_VAR(8, 4) -HIGHBD_SUBPIX_AVG_VAR(8, 4) - -HIGHBD_SUBPIX_VAR(8, 8) -HIGHBD_SUBPIX_AVG_VAR(8, 8) - -HIGHBD_SUBPIX_VAR(8, 16) -HIGHBD_SUBPIX_AVG_VAR(8, 16) - -HIGHBD_SUBPIX_VAR(16, 8) -HIGHBD_SUBPIX_AVG_VAR(16, 8) - -HIGHBD_SUBPIX_VAR(16, 16) -HIGHBD_SUBPIX_AVG_VAR(16, 16) - -HIGHBD_SUBPIX_VAR(16, 32) -HIGHBD_SUBPIX_AVG_VAR(16, 32) - -HIGHBD_SUBPIX_VAR(32, 16) -HIGHBD_SUBPIX_AVG_VAR(32, 16) - -HIGHBD_SUBPIX_VAR(32, 32) -HIGHBD_SUBPIX_AVG_VAR(32, 32) - -HIGHBD_SUBPIX_VAR(32, 64) -HIGHBD_SUBPIX_AVG_VAR(32, 64) - -HIGHBD_SUBPIX_VAR(64, 32) -HIGHBD_SUBPIX_AVG_VAR(64, 32) - -HIGHBD_SUBPIX_VAR(64, 64) -HIGHBD_SUBPIX_AVG_VAR(64, 64) -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h deleted file mode 100644 index 0a8739510..000000000 --- a/vp9/encoder/vp9_variance.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_ENCODER_VP9_VARIANCE_H_ -#define VP9_ENCODER_VP9_VARIANCE_H_ - -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride); - -typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - const uint8_t *second_pred); - -typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sad_array); - -typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t* const ref_ptr[], - int ref_stride, unsigned int *sad_array); - -typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int ref_stride, - unsigned int *sse); - -typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr, - int source_stride, - int xoffset, - int yoffset, - const uint8_t *ref_ptr, - int Refstride, - unsigned int *sse); - -typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, - int source_stride, - int xoffset, - int yoffset, - const uint8_t *ref_ptr, - int Refstride, - unsigned int *sse, - const uint8_t *second_pred); - -typedef struct vp9_variance_vtable { - vp9_sad_fn_t sdf; - vp9_sad_avg_fn_t sdaf; - vp9_variance_fn_t vf; - vp9_subpixvariance_fn_t svf; - vp9_subp_avg_variance_fn_t svaf; - vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi_fn_t sdx8f; - vp9_sad_multi_d_fn_t sdx4df; -} vp9_variance_fn_ptr_t; - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c deleted file mode 100644 index 29b7b2782..000000000 --- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ -#include "./vpx_config.h" -#include "vp9/common/vp9_common.h" - -#include "vp9/encoder/vp9_variance.h" -#include "vpx_ports/mem.h" - -#define DECL(w, opt) \ -int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse); -#define DECLS(opt1, opt2) \ -DECL(8, opt1); \ -DECL(16, opt1) - -DECLS(sse2, sse); -// DECLS(ssse3, ssse3); -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst8, \ - int dst_stride, \ - uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, h, \ - &sse); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, h, &sse2); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - h, &sse); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ - src_stride, \ - x_offset, y_offset, \ - dst + 16, \ - dst_stride, \ - h, &sse2); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? 
h - start_row : 16; \ - int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ - dst_stride, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - }\ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ -FN(8, 16, 8, 3, 4, opt1, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int64_t)); - - -FNS(sse2, sse); - -#undef FNS -#undef FN - -#define DECL(w, opt) \ -int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint16_t *dst, \ - ptrdiff_t dst_stride, \ - const uint16_t *sec, \ - ptrdiff_t sec_stride, \ - int height, \ - unsigned int *sse); -#define DECLS(opt1) \ -DECL(16, opt1) \ -DECL(8, opt1) - -DECLS(sse2); -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, sec, w, h, &sse); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, \ - dst + 16, dst_stride, sec + 16, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, \ - dst + 32, dst_stride, sec + 32, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, \ - dst + 48, dst_stride, sec + 48, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, 
int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - uint32_t sse; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse); \ - if (w > wf) { \ - uint32_t sse2; \ - int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} \ -\ -uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ - const uint8_t *sec8) { \ - int start_row; \ - uint32_t sse; \ - int se = 0; \ - uint64_t long_sse = 0; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ - for (start_row = 0; start_row < h; start_row +=16) { \ - uint32_t sse2; \ - int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + (start_row * dst_stride), dst_stride, \ - sec + (start_row * w), w, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, \ - x_offset, y_offset, \ - dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2); \ - se += se2; \ - long_sse += sse2; \ - } \ - } \ - } \ - se = ROUND_POWER_OF_TWO(se, 4); \ - sse = ROUND_POWER_OF_TWO(long_sse, 8); \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - - -#define FNS(opt1) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ -FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ -FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ -FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ -FN(8, 4, 8, 3, 2, opt1, (int64_t)); - 
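For orientation, the wrappers generated by FN() above all reduce to the same scalar recipe: run the 8- or 16-wide kernel over each 16-column strip of the block, pool the per-strip sum of differences (se) and sum of squared differences (sse), optionally rescale both for 10/12-bit input, and finish with variance = sse - se*se/(w*h), where w*h is a power of two so the division is a shift by (wlog2 + hlog2). A minimal C sketch of that shape follows; the strip-kernel signature is illustrative only (the real kernels also take x_offset/y_offset and, for the avg variants, a second predictor):

#include <stdint.h>

/* Illustrative strip kernel: returns the sum of differences for one
 * wf-wide column strip and writes its sum of squared differences. */
typedef int (*strip_kernel)(const uint8_t *src, int src_stride,
                            const uint8_t *dst, int dst_stride,
                            int height, uint32_t *sse);

static uint32_t block_variance(strip_kernel kernel, int w, int wf,
                               int wlog2, int hlog2, int h,
                               const uint8_t *src, int src_stride,
                               const uint8_t *dst, int dst_stride,
                               uint32_t *sse_ptr) {
  int64_t se = 0;
  uint64_t sse = 0;
  int col;
  for (col = 0; col < w; col += wf) {  /* strips at +0, +16, +32, +48 */
    uint32_t sse_strip;
    se += kernel(src + col, src_stride, dst + col, dst_stride, h, &sse_strip);
    sse += sse_strip;
  }
  /* The 10- and 12-bit variants rescale se and sse here (>>2/>>4 and
   * >>4/>>8 respectively) before applying the same closing formula. */
  *sse_ptr = (uint32_t)sse;
  return (uint32_t)(sse - (uint64_t)((se * se) >> (wlog2 + hlog2)));
}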
-FNS(sse2); - -#undef FNS -#undef FN diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c deleted file mode 100644 index b1c797520..000000000 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c +++ /dev/null @@ -1,525 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // AVX2 - -#include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" -#include "vp9/encoder/vp9_variance.h" - -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *) \ - (src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = 
_mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - - -unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - int height, - unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { - src_avg = src_reg; - src+= src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - 
CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { - // save current source average - src_avg = src_reg; - src+= src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst+= dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - 
__m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec+= sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { - // save current source average - src_avg = src_reg; - src+= src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec+= sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height ; i++) { - // save current source average - src_avg = src_reg; - src+= src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_avg = _mm256_avg_epu8(src_avg, 
sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec+= sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height ; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec+= sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src+= src_stride; - dst+= dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec+= sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256((__m256i const *) - (bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *) (src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height ; i++) { - src+= src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec+= sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst+= dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c deleted file mode 100644 index 8cd071de5..000000000 --- a/vp9/encoder/x86/vp9_variance_avx2.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vp9_rtcd.h" -#include "./vpx_config.h" - -#include "vp9/encoder/vp9_variance.h" -#include "vpx_ports/mem.h" - -unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, - unsigned int *sse); - -unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - const uint8_t *sec, - int sec_stride, - int height, - unsigned int *sseptr); - -unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - unsigned int sse1; - const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 64, &sse1); - unsigned int sse2; - const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride, - x_offset, y_offset, - dst + 32, dst_stride, - 64, &sse2); - const int se = se1 + se2; - *sse = sse1 + sse2; - return *sse - (((int64_t)se * se) >> 12); -} - -unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - 32, sse); - return *sse - (((int64_t)se * se) >> 10); -} - -unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { - unsigned int sse1; - const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 64, 64, &sse1); - unsigned int sse2; - const int se2 = - vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, - y_offset, dst + 32, dst_stride, - sec + 32, 64, 64, &sse2); - const int se = se1 + se2; - - *sse = sse1 + sse2; - - return *sse - (((int64_t)se * se) >> 12); -} - -unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, - int src_stride, - int x_offset, - int y_offset, - const uint8_t *dst, - int dst_stride, - unsigned int *sse, - const uint8_t *sec) { - // processing 32 element in parallel - const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, - y_offset, dst, dst_stride, - sec, 32, 32, sse); - return *sse - (((int64_t)se * se) >> 10); -} diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c deleted file mode 100644 index 961efe34e..000000000 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include // SSE2 - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" - -#include "vp9/encoder/vp9_variance.h" -#include "vpx_ports/mem.h" - -// The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ -int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) -#define DECLS(opt1, opt2) \ -DECL(4, opt2); \ -DECL(8, opt1); \ -DECL(16, opt1) - -DECLS(sse2, sse); -DECLS(ssse3, ssse3); -#undef DECLS -#undef DECL - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ -FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ -FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ -FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ -FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ -FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ -FN(4, 4, 4, 2, 2, opt2, (unsigned int)) - -FNS(sse2, sse); -FNS(ssse3, ssse3); - -#undef FNS -#undef FN - -// The 2 unused parameters are place holders for PIC enabled build. 
-#define DECL(w, opt) \ -int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ - ptrdiff_t src_stride, \ - int x_offset, int y_offset, \ - const uint8_t *dst, \ - ptrdiff_t dst_stride, \ - const uint8_t *sec, \ - ptrdiff_t sec_stride, \ - int height, unsigned int *sse, \ - void *unused0, void *unused) -#define DECLS(opt1, opt2) \ -DECL(4, opt2); \ -DECL(8, opt1); \ -DECL(16, opt1) - -DECLS(sse2, sse); -DECLS(ssse3, ssse3); -#undef DECL -#undef DECLS - -#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ -unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ - int src_stride, \ - int x_offset, \ - int y_offset, \ - const uint8_t *dst, \ - int dst_stride, \ - unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - sec, w, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ - x_offset, y_offset, \ - dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ - x_offset, y_offset, \ - dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ - x_offset, y_offset, \ - dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, \ - NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - ((cast se * se) >> (wlog2 + hlog2)); \ -} - -#define FNS(opt1, opt2) \ -FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ -FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ -FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ -FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ -FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ -FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ -FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ -FN(16, 8, 16, 4, 3, opt1, (unsigned int)); \ -FN(8, 16, 8, 3, 4, opt1, (unsigned int)); \ -FN(8, 8, 8, 3, 3, opt1, (unsigned int)); \ -FN(8, 4, 8, 3, 2, opt1, (unsigned int)); \ -FN(4, 8, 4, 2, 3, opt2, (unsigned int)); \ -FN(4, 4, 4, 2, 2, opt2, (unsigned int)) - -FNS(sse2, sse); -FNS(ssse3, ssse3); - -#undef FNS -#undef FN diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index c257d91b4..2e43c272a 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -131,7 +131,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_d VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c # common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 216dc81c9..94cc7ba15 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -58,7 +58,6 @@ VP9_CX_SRCS-yes += encoder/vp9_pickmode.h VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.h VP9_CX_SRCS-yes += encoder/vp9_treewriter.h -VP9_CX_SRCS-yes += encoder/vp9_variance.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.c VP9_CX_SRCS-yes += encoder/vp9_encoder.c VP9_CX_SRCS-yes += encoder/vp9_picklpf.c @@ -84,7 +83,6 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c 
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c -VP9_CX_SRCS-yes += encoder/vp9_variance.c VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c @@ -103,7 +101,6 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -114,12 +111,6 @@ endif ifeq ($(CONFIG_USE_X86INC),yes) VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm -ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm -endif endif ifeq ($(ARCH_X86_64),yes) @@ -143,14 +134,12 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c -VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -160,6 +149,5 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c -VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vpx_dsp/arm/bilinear_filter_media.asm b/vpx_dsp/arm/bilinear_filter_media.asm new file mode 100644 index 000000000..f3f9754c1 --- /dev/null +++ b/vpx_dsp/arm/bilinear_filter_media.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vpx_filter_block2d_bil_first_pass_media| + EXPORT |vpx_filter_block2d_bil_second_pass_media| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vpx_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. 
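In scalar terms, each filtered sample produced by the two passes below is roughly out = (a * f0 + b * f1 + 64) >> 7, where (f0, f1) is a coefficient pair from the bilinear filter tables (the pair sums to 128); the result is saturated to 16 bits after the first (horizontal) pass and to 8 bits after the second (vertical) pass. A minimal C model of that rounding step, with illustrative names:

#include <stdint.h>

/* One bilinear tap: 'a' and 'b' are two adjacent samples, 'f' is a
 * coefficient pair summing to 128 (e.g. { 112, 16 } for offset 1). */
static uint16_t bilinear_tap_pass1(uint8_t a, uint8_t b, const int16_t *f) {
  int v = (a * f[0] + b * f[1] + 64) >> 7;                /* round, shift by 7 */
  return (uint16_t)(v < 0 ? 0 : (v > 65535 ? 65535 : v)); /* usat #16 */
}

static uint8_t bilinear_tap_pass2(uint16_t a, uint16_t b, const int16_t *f) {
  int v = (a * f[0] + b * f[1] + 64) >> 7;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));      /* usat #8 */
}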
+|vpx_filter_block2d_bil_first_pass_media| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vpx_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vpx_filter_block2d_bil_first_pass_media| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vpx_filter +;--------------------------------- +|vpx_filter_block2d_bil_second_pass_media| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vpx_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, 
r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vpx_filter_block2d_second_pass_media| + + END diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c new file mode 100644 index 000000000..e7d8c85fb --- /dev/null +++ b/vpx_dsp/arm/subpel_variance_media.c @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#if HAVE_MEDIA +static const int16_t bilinear_filters_media[8][2] = { + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 } +}; + +extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr, + uint16_t *dst_ptr, + uint32_t src_pitch, + uint32_t height, + uint32_t width, + const int16_t *filter); + +extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr, + uint8_t *dst_ptr, + int32_t src_pitch, + uint32_t height, + uint32_t width, + const int16_t *filter); + + +unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + uint16_t first_pass[10*8]; + uint8_t second_pass[8*8]; + const int16_t *HFilter, *VFilter; + + HFilter = bilinear_filters_media[xoffset]; + VFilter = bilinear_filters_media[yoffset]; + + vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, + src_pixels_per_line, + 9, 8, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, + 8, 8, 8, VFilter); + + return vpx_variance8x8_media(second_pass, 8, dst_ptr, + dst_pixels_per_line, sse); +} + +unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse) { + uint16_t first_pass[36*16]; + uint8_t second_pass[20*16]; + const int16_t *HFilter, *VFilter; + unsigned int var; + + if (xoffset == 4 && yoffset == 0) { + var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + sse); + } else if (xoffset == 0 && yoffset == 4) { + var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + sse); + } else if (xoffset == 4 && yoffset == 4) { + var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line, + dst_ptr, dst_pixels_per_line, + sse); + } else { + HFilter = bilinear_filters_media[xoffset]; + VFilter = bilinear_filters_media[yoffset]; + + vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, + src_pixels_per_line, + 17, 16, HFilter); + vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, + 16, 16, 16, VFilter); + + var = vpx_variance16x16_media(second_pass, 16, dst_ptr, + dst_pixels_per_line, sse); + } + return var; +} +#endif // HAVE_MEDIA diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c similarity index 90% rename from vp9/encoder/arm/neon/vp9_variance_neon.c rename to vpx_dsp/arm/subpel_variance_neon.c index a7f97e991..40e2cc89b 100644 --- a/vp9/encoder/arm/neon/vp9_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -9,14 +9,13 @@ */ #include -#include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" -#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/variance.h" static const uint8_t bilinear_filters[8][2] = { { 128, 0, }, @@ -35,9 +34,9 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const uint8_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); + const uint8_t *filter) { + const uint8x8_t f0 = 
vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -58,9 +57,9 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, unsigned int output_width, - const uint8_t *vp9_filter) { - const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]); - const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]); + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -80,7 +79,7 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, } } -unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, +unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, @@ -98,7 +97,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src, return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } -unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, +unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, @@ -116,7 +115,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } -unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, +unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, @@ -134,7 +133,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src, return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } -unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src, +unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src, int src_stride, int xoffset, int yoffset, diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm similarity index 98% rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm rename to vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm index 3668dc517..dab845a20 100644 --- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm +++ b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_variance_halfpixvar16x16_h_armv6| + EXPORT |vpx_variance_halfpixvar16x16_h_media| ARM REQUIRE8 @@ -22,7 +22,7 @@ ; r2 unsigned char *ref_ptr ; r3 int recon_stride ; stack unsigned int *sse -|vp8_variance_halfpixvar16x16_h_armv6| PROC +|vpx_variance_halfpixvar16x16_h_media| PROC stmfd sp!, {r4-r12, lr} diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm similarity index 98% rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm rename to vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm index b4e0959d1..01953b709 100644 --- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm +++ b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_variance_halfpixvar16x16_hv_armv6| + EXPORT |vpx_variance_halfpixvar16x16_hv_media| ARM REQUIRE8 @@ -22,7 +22,7 @@ ; r2 unsigned char *ref_ptr ; r3 int recon_stride ; stack unsigned int *sse -|vp8_variance_halfpixvar16x16_hv_armv6| PROC +|vpx_variance_halfpixvar16x16_hv_media| PROC stmfd sp!, {r4-r12, lr} diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm 
b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm similarity index 98% rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm rename to vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm index 10863e2ec..0d17acb38 100644 --- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm +++ b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_variance_halfpixvar16x16_v_armv6| + EXPORT |vpx_variance_halfpixvar16x16_v_media| ARM REQUIRE8 @@ -22,7 +22,7 @@ ; r2 unsigned char *ref_ptr ; r3 int recon_stride ; stack unsigned int *sse -|vp8_variance_halfpixvar16x16_v_armv6| PROC +|vpx_variance_halfpixvar16x16_v_media| PROC stmfd sp!, {r4-r12, lr} diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index 8410f4bbd..df8141b55 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -24,10 +24,34 @@ #define LD_UH(...) LD_H(v8u16, __VA_ARGS__) #define LD_SH(...) LD_H(v8i16, __VA_ARGS__) +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + #define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_SH(...) ST_H(v8i16, __VA_ARGS__) +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + #if (__mips_isa_rev >= 6) +#define LH(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__ ( \ + "lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) + #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ @@ -73,6 +97,18 @@ }) #endif // (__mips == 64) +#define SH(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + #define SW(val, pdst) { \ uint8_t *pdst_m = (uint8_t *)(pdst); \ const uint32_t val_m = (val); \ @@ -97,6 +133,20 @@ ); \ } #else // !(__mips_isa_rev >= 6) +#define LH(psrc) ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__ ( \ + "ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m) \ + ); \ + \ + val_m; \ +}) + #define LW(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ uint32_t val_m; \ @@ -111,18 +161,6 @@ val_m; \ }) -#define SW(val, pdst) { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__ ( \ - "usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m" (*pdst_m) \ - : [val_m] "r" (val_m) \ - ); \ -} - #if (__mips == 64) #define LD(psrc) ({ \ const uint8_t *psrc_m = (const uint8_t *)(psrc); \ @@ -154,6 +192,30 @@ }) #endif // (__mips == 64) +#define SH(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + +#define SW(val, pdst) { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__ ( \ + "usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m) \ + ); \ +} + #define SD(val, pdst) { \ uint8_t *pdst_m1 = (uint8_t *)(pdst); \ uint32_t val0_m, 
val1_m; \ @@ -196,6 +258,34 @@ LD2((psrc) + 2 * stride, stride, out2, out3); \ } +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ +} + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ +} + /* Description : Load vectors with 16 byte elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 @@ -228,6 +318,14 @@ out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ } #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6) { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ +} +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) #define LD_B8(RTYPE, psrc, stride, \ out0, out1, out2, out3, out4, out5, out6, out7) { \ @@ -247,6 +345,7 @@ out0 = LD_H(RTYPE, (psrc)); \ out1 = LD_H(RTYPE, (psrc) + (stride)); \ } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ LD_H2(RTYPE, (psrc), stride, out0, out1); \ @@ -254,6 +353,229 @@ } #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) +#define LD_H8(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ +} +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, out15) { \ + LD_H8(RTYPE, (psrc), stride, \ + out0, out1, out2, out3, out4, out5, out6, out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ + out8, out9, out10, out11, out12, out13, out14, out15); \ +} +#define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ +} + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ +} + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ +} +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + pdst, stride) { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ +} +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ +} +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} +#define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ +} + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ +} + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ +} + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} +#define ST4x8_UB(in0, in1, pdst, stride) { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ +} + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) { \ + uint64_t out0_m; \ + \ + out0_m = 
__msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ +} + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ +} + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ +} + /* Description : average with rounding (in0 + in1 + 1) / 2. Arguments : Inputs - in0, in1, in2, in3, Outputs - out0, out1 @@ -275,6 +597,27 @@ } #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ +} +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \ + out0, out1, out2, out3, slide_val) { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ +} +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + /* Description : Immediate number of elements to slide Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val Outputs - out0, out1 @@ -287,6 +630,148 @@ out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ } #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \ + out0, out1, out2, slide_val) { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ +} +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) 
SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ +} +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \ + out0, out1, out2, out3) { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ +} +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ +} +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3) { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ +} +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. 
+ The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ +} +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3) { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ +} +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ +} +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ +} +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) /* Description : Dot product & addition of halfword vector elements Arguments : Inputs - mult0, mult1, cnst0, cnst1 @@ -309,7 +794,7 @@ Outputs - out0, out1 Return Type - as per RTYPE Details : Each signed word element from 'mult0' is multiplied with itself - producing an intermediate result twice the size of it + producing an intermediate result twice the size of input i.e. signed double word The multiplication result of adjacent odd-even elements are added to the 'out0' vector @@ -320,6 +805,49 @@ } #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ +} +#define MIN_UH2_UH(...) 
MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ +} +#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ +}) +#define CLIP_SH2_0_255(in0, in1) { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ +} +#define CLIP_SH4_0_255(in0, in1, in2, in3) { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ +} + /* Description : Horizontal addition of 4 signed word elements of input vector Arguments : Input - in (signed word vector) Output - sum_m (i32 sum) @@ -358,11 +886,31 @@ sum_m; \ }) -/* Description : Horizontal subtraction of unsigned byte vector elements +/* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE - Details : Each unsigned odd byte element from 'in0' is subtracted from + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ +} +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ +} +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from even unsigned byte element from 'in0' (pairwise) and the halfword result is written to 'out0' */ @@ -393,12 +941,32 @@ sad_m; \ }) +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ +} +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + /* Description : Set element n input vector to GPR value Arguments : Inputs - in0, in1, in2, in3 Output - out Return Type - as per RTYPE Details : Set element 0 in vector 'out' to value specified in 'in0' */ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ +} +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ @@ -415,6 +983,211 @@ #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) 
INSERT_D2(v16i8, __VA_ARGS__) +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ +} +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ +} +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ +} +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ +} +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. 
+*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ +} +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \ + out4, out5, out6, out7); \ +} +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ +} +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ +} +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -427,13 +1200,138 @@ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ } #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) #define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ +} +#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ +} +#define SAT_UH4_UH(...) 
SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ +} +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ +} +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ +} +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \ + out0, out1, out2, out3) { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ +} +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ +} +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) 
PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + /* Description : Pack even double word elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 @@ -447,6 +1345,7 @@ out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ } #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3) { \ @@ -455,6 +1354,256 @@ } #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ +} +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ +} +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ +} +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ +} +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ +} +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ +} +#define ADDS_SH2_SH(...) 
ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ +} + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ +} + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ +} +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ +} +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ +} +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ +} +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ +} +#define SRARI_W4_SW(...) 
SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ +} +#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. 
+*/ +#define SUB2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ +} +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ +} + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ +} + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ +} + /* Description : Sign extend halfword elements from input vector and return the result in pair of vectors Arguments : Input - in (halfword vector) @@ -473,52 +1622,312 @@ ILVRL_H2_SW(tmp_m, in, out0, out1); \ } -/* Description : Store 4 double words with stride +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ +} + +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. 
out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, out15) { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ +} + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ +} +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = 
(v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ +} + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ +} + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, \ + tmp0_n, tmp1_n, tmp2_n, tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ +} + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ +} + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ + tmp3_m, tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ +} +#define 
TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} + +/* Description : Add block 4x4 Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Store double word from 'in0' to (pdst) - Store double word from 'in1' to (pdst + stride) - Store double word from 'in2' to (pdst + 2 * stride) - Store double word from 'in3' to (pdst + 3 * stride) + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. */ -#define SD4(in0, in1, in2, in3, pdst, stride) { \ - SD(in0, (pdst)) \ - SD(in1, (pdst) + stride); \ - SD(in2, (pdst) + 2 * stride); \ - SD(in3, (pdst) + 3 * stride); \ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ } -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte */ -#define ST_H2(RTYPE, in0, in1, pdst, stride) { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ +#define PCKEV_XORI128_UB(in0, in1) ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ +}) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ + dst0, dst1, dst2, dst3, pdst, stride) { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } -#define ST_SH2(...) 
ST_H2(v8i16, __VA_ARGS__) -/* Description : Store 8x4 byte block to destination memory from input - vectors - Arguments : Inputs - in0, in1, pdst, stride - Details : Index 0 double word element from 'in0' vector is copied to the - GP register and stored to (pdst) - Index 1 double word element from 'in0' vector is copied to the - GP register and stored to (pdst + stride) - Index 0 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 2 * stride) - Index 1 double word element from 'in1' vector is copied to the - GP register and stored to (pdst + 3 * stride) +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst */ -#define ST8x4_UB(in0, in1, pdst, stride) { \ - uint64_t out0_m, out1_m, out2_m, out3_m; \ - uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ - \ - out0_m = __msa_copy_u_d((v2i64)in0, 0); \ - out1_m = __msa_copy_u_d((v2i64)in0, 1); \ - out2_m = __msa_copy_u_d((v2i64)in1, 0); \ - out3_m = __msa_copy_u_d((v2i64)in1, 1); \ - \ - SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ +#define PCKEV_ST_SB(in0, in1, pdst) { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + tmp1_m = __msa_sat_u_h(tmp1_m, shift); \ + \ + tmp1_m; \ +}) #endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/vp9/encoder/mips/msa/vp9_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c similarity index 96% rename from vp9/encoder/mips/msa/vp9_variance_msa.c rename to vpx_dsp/mips/sub_pixel_variance_msa.c index edae5101b..51681926f 100644 --- a/vp9/encoder/mips/msa/vp9_variance_msa.c +++ b/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -8,13 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_filter.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" +#include "vpx_dsp/mips/macros_msa.h" +#include "vpx_dsp/variance.h" -static const uint8_t bilinear_filters[8][2] = { +static const uint8_t bilinear_filters_msa[8][2] = { { 128, 0, }, { 112, 16, }, { 96, 32, }, @@ -707,8 +706,8 @@ static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src, #define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); #define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); -#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ -uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ +#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ +uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ int32_t src_stride, \ int32_t xoffset, \ int32_t yoffset, \ @@ -717,8 +716,8 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters[xoffset]; \ - const uint8_t *v_filter = bilinear_filters[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ @@ -749,20 +748,20 @@ uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src, \ return var; \ } -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); -VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32); +VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 084dd7b7e..e8bddb0a0 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -14,13 +14,26 @@ #include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" -unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride) { +#include "vpx_dsp/variance.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, + { 112, 16 }, + { 96, 32 }, + { 80, 48 }, + { 64, 64 }, + { 48, 80 }, + { 32, 96 }, + { 16, 112 }, +}; + +uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride) { int distortion = 0; int r, c; - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { int diff = a[c] - b[c]; distortion += diff * diff; } @@ -32,7 +45,7 @@ unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride, return distortion; } -unsigned int vpx_get_mb_ss_c(const int16_t *a) { +uint32_t vpx_get_mb_ss_c(const 
int16_t *a) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { @@ -42,16 +55,38 @@ unsigned int vpx_get_mb_ss_c(const int16_t *a) { return sum; } +uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 0, + b, b_stride, sse); +} + + +uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, 4, + b, b_stride, sse); +} + +uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return vpx_sub_pixel_variance16x16_c(a, a_stride, 4, 4, + b, b_stride, sse); +} + static void variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { int i, j; *sum = 0; *sse = 0; - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { const int diff = a[j] - b[j]; *sum += diff; *sse += diff * diff; @@ -62,15 +97,113 @@ static void variance(const uint8_t *a, int a_stride, } } +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + + (int)a[pixel_step] * filter[1], + FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. 
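+//
+// For example, with the {64, 64} filter (offset 4 in the bilinear table) and
+// inputs a[0] = 130, a[pixel_step] = 134, the result is
+// ROUND_POWER_OF_TWO(130 * 64 + 134 * 64, FILTER_BITS) = 16960 >> 7 = 132,
+// the average of the two samples.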
+static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO((int)a[0] * filter[0] + + (int)a[pixel_step] * filter[1], + FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + #define VAR(W, H) \ -unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ +uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ int sum; \ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (((int64_t)sum * sum) / (W * H)); \ } +#define SUBPIX_VAR(W, H) \ +uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ +\ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +} + +#define SUBPIX_AVG_VAR(W, H) \ +uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + int xoffset, int yoffset, \ + const uint8_t *b, \ + int b_stride, \ + uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ +\ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ +\ + return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +} + /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h @@ -78,7 +211,7 @@ unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ #define GET_VAR(W, H) \ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride, \ - unsigned int *sse, int *sum) { \ + uint32_t *sse, int *sum) { \ variance(a, a_stride, b, b_stride, W, H, sse, sum); \ } @@ -87,27 +220,33 @@ void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ * variable. */ #define MSE(W, H) \ -unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ +uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ int sum; \ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse; \ } -VAR(64, 64) -VAR(64, 32) -VAR(32, 64) -VAR(32, 32) -VAR(32, 16) -VAR(16, 32) -VAR(16, 16) -VAR(16, 8) -VAR(8, 16) -VAR(8, 8) -VAR(8, 4) -VAR(4, 8) -VAR(4, 4) +/* All three forms of the variance are available in the same sizes. 
*/ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) GET_VAR(16, 16) GET_VAR(8, 8) @@ -117,12 +256,13 @@ MSE(16, 8) MSE(8, 16) MSE(8, 8) -void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride) { +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, + const uint8_t *ref, int ref_stride) { int i, j; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } @@ -143,8 +283,8 @@ static void highbd_variance64(const uint8_t *a8, int a_stride, *sum = 0; *sse = 0; - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { const int diff = a[j] - b[j]; *sum += diff; *sse += diff * diff; @@ -156,60 +296,60 @@ static void highbd_variance64(const uint8_t *a8, int a_stride, static void highbd_8_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; uint64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)sse_long; + *sse = (uint32_t)sse_long; *sum = (int)sum_long; } static void highbd_10_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; uint64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } static void highbd_12_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, - int w, int h, unsigned int *sse, int *sum) { + int w, int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; uint64_t sum_long = 0; highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } #define HIGHBD_VAR(W, H) \ -unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + uint32_t *sse) { \ int sum; \ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (((int64_t)sum * sum) / (W * H)); \ } \ \ -unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + uint32_t *sse) { \ int sum; \ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (((int64_t)sum * sum) / (W * H)); \ } \ \ -unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ - int a_stride, \ - 
const uint8_t *b, \ - int b_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ + int a_stride, \ + const uint8_t *b, \ + int b_stride, \ + uint32_t *sse) { \ int sum; \ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ return *sse - (((int64_t)sum * sum) / (W * H)); \ @@ -217,54 +357,243 @@ unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \ #define HIGHBD_GET_VAR(S) \ void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ } \ \ void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ + uint32_t *sse, int *sum) { \ highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ } \ \ void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ const uint8_t *ref, int ref_stride, \ - unsigned int *sse, int *sum) { \ + uint32_t *sse, int *sum) { \ highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ } #define HIGHBD_MSE(W, H) \ -unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + uint32_t *sse) { \ int sum; \ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } \ \ -unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + uint32_t *sse) { \ int sum; \ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } \ \ -unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ - int src_stride, \ - const uint8_t *ref, \ - int ref_stride, \ - unsigned int *sse) { \ +uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ + int src_stride, \ + const uint8_t *ref, \ + int ref_stride, \ + uint32_t *sse) { \ int sum; \ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ return *sse; \ } +static void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = + ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... 
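+    // (src_pixels_per_line - output_width skips the remainder of the input
+    // row that the inner loop did not consume.)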
+ src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = + ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] + + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ +uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \ + dst_stride, sse); \ +} \ +\ +uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, dst, dst_stride, sse); \ +} \ +\ +uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, dst, dst_stride, sse); \ +} + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ +uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ +\ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ +} \ +\ +uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t 
temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ +\ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + W, dst, dst_stride, sse); \ +} \ +\ +uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, \ + int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, \ + uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ +\ + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[xoffset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ +\ + vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ +\ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ + W, dst, dst_stride, sse); \ +} + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) + HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) @@ -273,28 +602,14 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -HIGHBD_VAR(64, 64) -HIGHBD_VAR(64, 32) -HIGHBD_VAR(32, 64) -HIGHBD_VAR(32, 32) -HIGHBD_VAR(32, 16) -HIGHBD_VAR(16, 32) -HIGHBD_VAR(16, 16) -HIGHBD_VAR(16, 8) -HIGHBD_VAR(8, 16) -HIGHBD_VAR(8, 8) -HIGHBD_VAR(8, 4) -HIGHBD_VAR(4, 8) -HIGHBD_VAR(4, 4) - void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); } diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h new file mode 100644 index 000000000..c18d9b48f --- /dev/null +++ b/vpx_dsp/variance.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_VARIANCE_H_ +#define VPX_DSP_VARIANCE_H_ + +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b_ptr, int b_stride); + +typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *second_pred); + +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, + uint8_t *b, int b_stride, int n); + +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sad_array); + +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, + unsigned int *sad_array); + +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr, + int a_stride, + int xoffset, int yoffset, + const uint8_t *b_ptr, + int b_stride, + unsigned int *sse, + const uint8_t *second_pred); +#if CONFIG_VP8 +typedef struct variance_vtable { + vpx_sad_fn_t sdf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_variance_fn_t svf_halfpix_h; + vpx_variance_fn_t svf_halfpix_v; + vpx_variance_fn_t svf_halfpix_hv; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; +#if ARCH_X86 || ARCH_X86_64 + vp8_copy32xn_fn_t copymem; +#endif +} vp8_variance_fn_ptr_t; +#endif // CONFIG_VP8 + +#if CONFIG_VP9 +typedef struct vp9_variance_vtable { + vpx_sad_fn_t sdf; + vpx_sad_avg_fn_t sdaf; + vpx_variance_fn_t vf; + vpx_subpixvariance_fn_t svf; + vpx_subp_avg_variance_fn_t svaf; + vpx_sad_multi_fn_t sdx3f; + vpx_sad_multi_fn_t sdx8f; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_variance_fn_ptr_t; +#endif // CONFIG_VP9 + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_VARIANCE_H_ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index ecfaf4e01..70a131ced 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -10,6 +10,8 @@ DSP_SRCS-yes += vpx_dsp.mk +DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c @@ -19,7 +21,6 @@ DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c @@ -45,21 +46,36 @@ endif # CONFIG_ENCODERS ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) DSP_SRCS-yes += variance.c +DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM) DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c 
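The C reference registered here (vpx_dsp/variance.c, shown earlier in this patch) builds sub-pixel variance from three steps: a horizontal 2-tap bilinear pass into 16-bit intermediates, a vertical 2-tap pass back to 8-bit, and the plain variance kernel. The following is a minimal standalone sketch of that flow for a single 8x8 block; the fixed size, buffer contents and main() harness are illustrative only, while the real code generates one function per block size through the SUBPIX_VAR macro.

#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 7
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Same 2-tap table as vpx_dsp/variance.c; the index is the 1/8-pel offset. */
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

/* First pass: horizontal 2-tap filter, 8-bit in, 16-bit out, 8 + 1 rows. */
static void first_pass_8x8(const uint8_t *a, int a_stride, uint16_t *b,
                           const uint8_t *filter) {
  int i, j;
  for (i = 0; i < 9; ++i) {
    for (j = 0; j < 8; ++j)
      b[i * 8 + j] = ROUND_POWER_OF_TWO(a[j] * filter[0] +
                                        a[j + 1] * filter[1], FILTER_BITS);
    a += a_stride;
  }
}

/* Second pass: vertical 2-tap filter over the 16-bit rows, 8-bit out. */
static void second_pass_8x8(const uint16_t *a, uint8_t *b,
                            const uint8_t *filter) {
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j)
      b[i * 8 + j] = ROUND_POWER_OF_TWO(a[i * 8 + j] * filter[0] +
                                        a[(i + 1) * 8 + j] * filter[1],
                                        FILTER_BITS);
}

/* Plain 8x8 variance of the filtered block against the reference. */
static uint32_t variance_8x8(const uint8_t *a, const uint8_t *b, int b_stride,
                             uint32_t *sse) {
  int i, j, sum = 0;
  *sse = 0;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      const int diff = a[i * 8 + j] - b[i * b_stride + j];
      sum += diff;
      *sse += diff * diff;
    }
  }
  return *sse - (uint32_t)(((int64_t)sum * sum) / 64);
}

int main(void) {
  uint8_t src[9 * 16], ref[8 * 16];  /* 8x8 blocks stored in 16-wide rows */
  uint16_t fdata[9 * 8];
  uint8_t temp[8 * 8];
  uint32_t sse, var;
  int i;

  for (i = 0; i < 9 * 16; ++i) src[i] = (uint8_t)(i * 7);
  for (i = 0; i < 8 * 16; ++i) ref[i] = (uint8_t)(i * 5);

  first_pass_8x8(src, 16, fdata, bilinear_filters[3]);  /* xoffset = 3 */
  second_pass_8x8(fdata, temp, bilinear_filters[5]);    /* yoffset = 5 */
  var = variance_8x8(temp, ref, 16, &sse);
  printf("variance = %u, sse = %u\n", var, sse);
  return 0;
}

Building this with any C compiler and running it prints one variance/SSE pair; it is only meant to make the data flow of the macros above concrete.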
DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm -DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c +ifeq ($(CONFIG_USE_X86INC),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 +endif # CONFIG_USE_X86INC + ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +ifeq ($(CONFIG_USE_X86INC),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +endif # CONFIG_USE_X86INC endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 61bd74954..8e4e96634 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -412,6 +412,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { +# +# Variance +# add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance64x64 sse2 avx2 neon msa/; @@ -451,7 +454,9 @@ add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_ add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance4x4 mmx sse2 msa/; - +# +# Specialty Variance +# add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; specialize qw/vpx_get16x16var sse2 avx2 neon msa/; @@ -478,6 +483,99 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; +# +# Subpixel Variance +# +add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", 
"$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred"; + specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc"; + +add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc"; + +# +# Specialty Subpixel +# +add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_variance_halfpixvar16x16_h mmx media/; + +add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_variance_halfpixvar16x16_v mmx media/; + +add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/; + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_variance64x64 sse2/; @@ -615,6 +713,226 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_12_mse8x8 sse2/; add_proto 
qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + + # + # Subpixel Variance + # + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; 
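Callers reach the high-bitdepth variants through the same uint8_t * prototypes declared here, passing pointers prepared with CONVERT_TO_BYTEPTR(), exactly as the HIGHBD_SUBPIX_VAR implementations above do when they hand CONVERT_TO_BYTEPTR(temp2) to the variance kernels. A rough usage sketch follows; it assumes a CONFIG_VP9_HIGHBITDEPTH build in which ./vpx_dsp_rtcd.h and vpx_ports/mem.h are available, and the buffer contents, sizes and offsets are made up for illustration.

#include <stdint.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"  /* generated prototypes, e.g. the 16x16 variant */
#include "vpx_ports/mem.h"   /* CONVERT_TO_BYTEPTR() */

int main(void) {
  /* High-bitdepth pixels are stored as uint16_t; strides are in samples. */
  static uint16_t src[16 * 16], ref[16 * 16];
  uint32_t sse, var;
  int i;

  for (i = 0; i < 16 * 16; ++i) {
    src[i] = (uint16_t)(i & 0x3ff);        /* 10-bit sample values */
    ref[i] = (uint16_t)((i * 3) & 0x3ff);
  }

  /* xoffset = 2, yoffset = 6 select the {96, 32} and {32, 96} 2-tap filters. */
  var = vpx_highbd_10_sub_pixel_variance16x16_c(
      CONVERT_TO_BYTEPTR(src), 16, 2, 6, CONVERT_TO_BYTEPTR(ref), 16, &sse);

  printf("var = %u sse = %u\n", var, sse);
  return 0;
}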
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t 
vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; + + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm 
b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm similarity index 99% rename from vp9/encoder/x86/vp9_highbd_subpel_variance.asm rename to vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 4594bb1aa..2be64460f 100644 --- a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -8,6 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. ; +%define program_name vpx + %include "third_party/x86inc/x86inc.asm" SECTION_RODATA @@ -30,7 +32,7 @@ bilin_filter_m_sse2: times 8 dw 16 SECTION .text -; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, ; const uint8_t *dst, ptrdiff_t dst_stride, ; int height, unsigned int *sse); diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c index 343c0478b..fe35c1e86 100644 --- a/vpx_dsp/x86/highbd_variance_sse2.c +++ b/vpx_dsp/x86/highbd_variance_sse2.c @@ -8,9 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ #include "./vpx_config.h" -#include "vp9/common/vp9_common.h" -#include "vp9/encoder/vp9_variance.h" #include "vpx_ports/mem.h" typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride, @@ -243,3 +241,341 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, sse, &sum, vpx_highbd_calc8x8var_sse2, 8); return *sse; } + +#if CONFIG_USE_X86INC +#define DECL(w, opt) \ + int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint16_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse); +#define DECLS(opt1, opt2) \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse); +// TODO(johannkoenig): enable the ssse3 or delete +// DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, h, \ + &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ + src_stride, \ + x_offset, y_offset, \ + dst + 16, \ + dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, \ + dst + 48, dst_stride, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +\ +uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + h, &sse); \ + if (w 
> wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ + src_stride, \ + x_offset, y_offset, \ + dst + 16, \ + dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +\ +uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row +=16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + }\ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ +FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ +FN(8, 16, 8, 3, 4, opt1, (int64_t)); \ +FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ +FN(8, 4, 8, 3, 2, opt1, (int64_t)); + + +FNS(sse2, sse); + +#undef FNS +#undef FN + +#define DECL(w, opt) \ +int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint16_t *dst, \ + ptrdiff_t dst_stride, \ + const uint16_t *sec, \ + ptrdiff_t sec_stride, \ + int height, \ + unsigned int *sse); +#define DECLS(opt1) \ +DECL(16, opt1) \ +DECL(8, opt1) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int 
dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, \ + y_offset, dst, dst_stride, sec, w, h, &sse); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, \ + dst + 16, dst_stride, sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, \ + dst + 32, dst_stride, sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, \ + dst + 48, dst_stride, sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +\ +uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +\ +uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row +=16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? 
h - start_row : 16; \ + int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + (start_row * dst_stride), dst_stride, \ + sec + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + + +#define FNS(opt1) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ +FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ +FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ +FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ +FN(8, 4, 8, 3, 2, opt1, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN +#endif // CONFIG_USE_X86INC diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vpx_dsp/x86/subpel_variance_sse2.asm similarity index 99% rename from vp9/encoder/x86/vp9_subpel_variance.asm rename to vpx_dsp/x86/subpel_variance_sse2.asm index 292cf34d1..294f54f9c 100644 --- a/vp9/encoder/x86/vp9_subpel_variance.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -8,6 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. 
; +%define program_name vpx + %include "third_party/x86inc/x86inc.asm" SECTION_RODATA @@ -39,7 +41,7 @@ bilin_filter_m_ssse3: times 8 db 16, 0 SECTION .text -; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, ; const uint8_t *dst, ptrdiff_t dst_stride, ; int height, unsigned int *sse); diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 82cef4af0..7851a98b1 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -91,3 +91,93 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, sse, &sum, vpx_get32x32var_avx2, 32); return *sse - (((int64_t)sum * sum) >> 11); } + +unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, + unsigned int *sse); + +unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + const uint8_t *sec, + int sec_stride, + int height, + unsigned int *sseptr); + +unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + unsigned int sse1; + const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 64, &sse1); + unsigned int sse2; + const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, + x_offset, y_offset, + dst + 32, dst_stride, + 64, &sse2); + const int se = se1 + se2; + *sse = sse1 + sse2; + return *sse - (((int64_t)se * se) >> 12); +} + +unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + 32, sse); + return *sse - (((int64_t)se * se) >> 10); +} + +unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse, + const uint8_t *sec) { + unsigned int sse1; + const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + sec, 64, 64, &sse1); + unsigned int sse2; + const int se2 = + vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset, + y_offset, dst + 32, dst_stride, + sec + 32, 64, 64, &sse2); + const int se = se1 + se2; + + *sse = sse1 + sse2; + + return *sse - (((int64_t)se * se) >> 12); +} + +unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse, + const uint8_t *sec) { + // Process 32 elements in parallel. 
+ const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset, + y_offset, dst, dst_stride, + sec, 32, 32, sse); + return *sse - (((int64_t)se * se) >> 10); +} diff --git a/vpx_dsp/x86/variance_impl_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c index 0e40959aa..b289e9a0c 100644 --- a/vpx_dsp/x86/variance_impl_avx2.c +++ b/vpx_dsp/x86/variance_impl_avx2.c @@ -11,6 +11,27 @@ #include // AVX2 #include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, @@ -213,3 +234,494 @@ void vpx_get32x32var_avx2(const unsigned char *src_ptr, _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); } } + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *) (src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *) (dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *) \ + (src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *) \ + (src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = 
_mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + + +unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + int height, + unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, 
src_avg; + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + // save current source average + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + } + CALC_SUM_AND_SSE + return sum; +} + +unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src, + int src_stride, + int x_offset, + int y_offset, + const uint8_t *dst, + int dst_stride, + const uint8_t *sec, + int sec_stride, + int height, + unsigned int *sse) { + __m256i sec_reg; + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 
0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec+= sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec+= sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec+= sec_stride; + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec+= sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + // save current source average + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + sec+= sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height ; i++) { + // save current source average + src_avg = src_reg; + src+= src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + sec+= sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + // 
x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height ; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + MERGE_WITH_SRC(src_reg, zero_reg) + sec+= sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + src+= src_stride; + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + sec+= sec_stride; + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256((__m256i const *) + (bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *) (src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height ; i++) { + src+= src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *) (sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + sec+= sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst+= dst_stride; + } + } + } + CALC_SUM_AND_SSE + return sum; +} diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm index a8d7d99db..b8ba79b65 100644 --- a/vpx_dsp/x86/variance_impl_mmx.asm +++ b/vpx_dsp/x86/variance_impl_mmx.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +%define mmx_filter_shift 7 + ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr ) global sym(vpx_get_mb_ss_mmx) PRIVATE sym(vpx_get_mb_ss_mmx): @@ -52,7 +54,6 @@ sym(vpx_get_mb_ss_mmx): movsxd rcx, dword ptr [rsp+4] add rax, rcx - ; begin epilog add rsp, 8 pop rdi @@ -62,7 +63,6 @@ sym(vpx_get_mb_ss_mmx): pop rbp 
ret - ;void vpx_get8x8var_mmx ;( ; unsigned char *src_ptr, @@ -83,7 +83,6 @@ sym(vpx_get8x8var_mmx): sub rsp, 16 ; end prolog - pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 @@ -117,7 +116,6 @@ sym(vpx_get8x8var_mmx): paddd mm7, mm0 ; accumulate in mm7 paddd mm7, mm2 ; accumulate in mm7 - ; Row 2 movq mm0, [rax] ; Copy eight bytes to mm0 movq mm2, mm0 ; Take copies @@ -298,7 +296,6 @@ sym(vpx_get8x8var_mmx): mov dword ptr [rdi], edx xor rax, rax ; return 0 - ; begin epilog add rsp, 16 pop rbx @@ -308,8 +305,6 @@ sym(vpx_get8x8var_mmx): pop rbp ret - - ;void ;vpx_get4x4var_mmx ;( @@ -331,7 +326,6 @@ sym(vpx_get4x4var_mmx): sub rsp, 16 ; end prolog - pxor mm5, mm5 ; Blank mmx6 pxor mm6, mm6 ; Blank mmx7 pxor mm7, mm7 ; Blank mmx7 @@ -354,7 +348,6 @@ sym(vpx_get4x4var_mmx): movd mm1, [rbx] ; Copy four bytes to mm1 paddd mm7, mm0 ; accumulate in mm7 - ; Row 2 movd mm0, [rax] ; Copy four bytes to mm0 punpcklbw mm0, mm6 ; unpack to higher prrcision @@ -393,7 +386,6 @@ sym(vpx_get4x4var_mmx): pmaddwd mm0, mm0 ; square and accumulate paddd mm7, mm0 ; accumulate in mm7 - ; Now accumulate the final results. movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory @@ -413,7 +405,6 @@ sym(vpx_get4x4var_mmx): mov dword ptr [rdi], edx xor rax, rax ; return 0 - ; begin epilog add rsp, 16 pop rbx @@ -422,3 +413,332 @@ sym(vpx_get4x4var_mmx): UNSHADOW_ARGS pop rbp ret + +;void vpx_filter_block2d_bil4x4_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE +sym(vpx_filter_block2d_bil4x4_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + + mov rax, arg(4) ;HFilter ; + mov rdx, arg(5) ;VFilter ; + + mov rsi, arg(0) ;ref_ptr ; + mov rdi, arg(2) ;src_ptr ; + + mov rcx, 4 ; + pxor mm0, mm0 ; + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm5, mm1 + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + add rsi, r8 +%endif + +.filter_block2d_bil4x4_var_mmx_loop: + + movd mm1, [rsi] ; + movd mm3, [rsi+1] ; + + punpcklbw mm1, mm0 ; + pmullw mm1, [rax] ; + + punpcklbw mm3, mm0 ; + pmullw mm3, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + movq mm3, mm5 ; + + movq mm5, mm1 ; + pmullw mm3, [rdx] ; + + pmullw mm1, [rdx+8] ; + paddw mm1, mm3 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + movd mm3, [rdi] ; + punpcklbw mm3, mm0 ; + + psubw mm1, mm3 ; + paddw mm6, mm1 ; + + pmaddwd mm1, mm1 ; + paddd mm7, mm1 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + movsxd r9, dword ptr arg(3) ;src_pixels_per_line + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil4x4_var_mmx_loop ; + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + 
+ paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(6) ;sum + mov rsi, arg(7) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vpx_filter_block2d_bil_var_mmx +;( +; unsigned char *ref_ptr, +; int ref_pixels_per_line, +; unsigned char *src_ptr, +; int src_pixels_per_line, +; unsigned int Height, +; unsigned short *HFilter, +; unsigned short *VFilter, +; int *sum, +; unsigned int *sumsquared +;) +global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE +sym(vpx_filter_block2d_bil_var_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + GET_GOT rbx + push rsi + push rdi + sub rsp, 16 + ; end prolog + + pxor mm6, mm6 ; + pxor mm7, mm7 ; + mov rax, arg(5) ;HFilter ; + + mov rdx, arg(6) ;VFilter ; + mov rsi, arg(0) ;ref_ptr ; + + mov rdi, arg(2) ;src_ptr ; + movsxd rcx, dword ptr arg(4) ;Height ; + + pxor mm0, mm0 ; + movq mm1, [rsi] ; + + movq mm3, [rsi+1] ; + movq mm2, mm1 ; + + movq mm4, mm3 ; + punpcklbw mm1, mm0 ; + + punpckhbw mm2, mm0 ; + pmullw mm1, [rax] ; + + pmullw mm2, [rax] ; + punpcklbw mm3, mm0 ; + + punpckhbw mm4, mm0 ; + pmullw mm3, [rax+8] ; + + pmullw mm4, [rax+8] ; + paddw mm1, mm3 ; + + paddw mm2, mm4 ; + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm2, mmx_filter_shift ; + movq mm5, mm1 + + packuswb mm5, mm2 ; +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line + add rsi, r8 +%endif + +.filter_block2d_bil_var_mmx_loop: + + movq mm1, [rsi] ; + movq mm3, [rsi+1] ; + + movq mm2, mm1 ; + movq mm4, mm3 ; + + punpcklbw mm1, mm0 ; + punpckhbw mm2, mm0 ; + + pmullw mm1, [rax] ; + pmullw mm2, [rax] ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + pmullw mm3, [rax+8] ; + pmullw mm4, [rax+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + psraw mm1, mmx_filter_shift ; + + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + psraw mm2, mmx_filter_shift ; + + movq mm3, mm5 ; + movq mm4, mm5 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + movq mm5, mm1 ; + packuswb mm5, mm2 ; + + pmullw mm3, [rdx] ; + pmullw mm4, [rdx] ; + + pmullw mm1, [rdx+8] ; + pmullw mm2, [rdx+8] ; + + paddw mm1, mm3 ; + paddw mm2, mm4 ; + + paddw mm1, [GLOBAL(mmx_bi_rd)] ; + paddw mm2, [GLOBAL(mmx_bi_rd)] ; + + psraw mm1, mmx_filter_shift ; + psraw mm2, mmx_filter_shift ; + + movq mm3, [rdi] ; + movq mm4, mm3 ; + + punpcklbw mm3, mm0 ; + punpckhbw mm4, mm0 ; + + psubw mm1, mm3 ; + psubw mm2, mm4 ; + + paddw mm6, mm1 ; + pmaddwd mm1, mm1 ; + + paddw mm6, mm2 ; + pmaddwd mm2, mm2 ; + + paddd mm7, mm1 ; + paddd mm7, mm2 ; + +%if ABI_IS_32BIT + add rsi, dword ptr arg(1) ;ref_pixels_per_line ; + add rdi, dword ptr arg(3) ;src_pixels_per_line ; +%else + movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; + movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; + add rsi, r8 + add rdi, r9 +%endif + sub rcx, 1 ; + jnz .filter_block2d_bil_var_mmx_loop ; + + pxor mm3, mm3 ; + pxor mm2, mm2 ; + + punpcklwd mm2, mm6 ; + punpckhwd mm3, mm6 ; + + paddd mm2, mm3 ; + movq mm6, mm2 ; + + psrlq mm6, 32 ; + paddd mm2, mm6 ; + + psrad mm2, 16 ; + movq mm4, mm7 ; + + psrlq mm4, 32 ; + paddd mm4, mm7 ; + + mov rdi, arg(7) ;sum + mov rsi, arg(8) ;sumsquared + + movd dword ptr [rdi], mm2 ; + movd dword ptr [rsi], mm4 ; + + ; begin 
epilog + add rsp, 16 + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +;short mmx_bi_rd[4] = { 64, 64, 64, 64}; +align 16 +mmx_bi_rd: + times 4 dw 64 diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c index 99dd741bc..f04f4e2c8 100644 --- a/vpx_dsp/x86/variance_mmx.c +++ b/vpx_dsp/x86/variance_mmx.c @@ -10,12 +10,45 @@ #include "./vpx_dsp_rtcd.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = { + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 16, 16, 16, 16, 112, 112, 112, 112 } +}; + extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, unsigned int *sse, int *sum); -unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + const int16_t *HFilter, + const int16_t *VFilter, + int *sum, + unsigned int *sumsquared); + +extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr, + int ref_pixels_per_line, + const unsigned char *src_ptr, + int src_pixels_per_line, + unsigned int Height, + const int16_t *HFilter, + const int16_t *VFilter, + int *sum, + unsigned int *sumsquared); + + +unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int var; int avg; @@ -25,8 +58,8 @@ unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride, return (var - (((unsigned int)avg * avg) >> 4)); } -unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int var; int avg; @@ -37,8 +70,8 @@ unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride, return (var - (((unsigned int)avg * avg) >> 6)); } -unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int sse0, sse1, sse2, sse3, var; int sum0, sum1, sum2, sum3; @@ -55,8 +88,8 @@ unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride, return var; } -unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int sse0, sse1, sse2, sse3, var; int sum0, sum1, sum2, sum3, avg; @@ -74,8 +107,8 @@ unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride, return (var - (((unsigned int)avg * avg) >> 8)); } -unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int sse0, sse1, var; int sum0, sum1, avg; @@ -89,8 +122,8 @@ unsigned int vpx_variance16x8_mmx(const unsigned 
char *a, int a_stride, return (var - (((unsigned int)avg * avg) >> 7)); } -unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, - const unsigned char *b, int b_stride, +unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, + const unsigned char *b, int b_stride, unsigned int *sse) { unsigned int sse0, sse1, var; int sum0, sum1, avg; @@ -105,3 +138,112 @@ unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride, return (var - (((unsigned int)avg * avg) >> 7)); } + +uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int xsum; + unsigned int xxsum; + vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum, &xxsum); + *sse = xxsum; + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +} + + +uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int xsum; + uint32_t xxsum; + vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum, &xxsum); + *sse = xxsum; + return (xxsum - (((uint32_t)xsum * xsum) >> 6)); +} + +uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0); + + vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8)); +} + +uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int xsum0, xsum1; + unsigned int xxsum0, xxsum1; + + vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum0, &xxsum0); + + vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum1, &xxsum1); + + xsum0 += xsum1; + xxsum0 += xxsum1; + + *sse = xxsum0; + return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7)); +} + +uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int xsum; + unsigned int xxsum; + vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16, + bilinear_filters_mmx[xoffset], + bilinear_filters_mmx[yoffset], + &xsum, &xxsum); + *sse = xxsum; + return (xxsum - (((uint32_t)xsum * xsum) >> 7)); +} + +uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse); +} + +uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse); +} + +uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t 
*sse) { + return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse); +} diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c index 6256bc536..e6c9365ab 100644 --- a/vpx_dsp/x86/variance_sse2.c +++ b/vpx_dsp/x86/variance_sse2.c @@ -307,3 +307,171 @@ unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); return *sse; } + +#if CONFIG_USE_X86INC +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt1, opt2) \ + DECL(4, opt2); \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \ +FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \ +FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \ +FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \ +FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \ +FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \ +FN(4, 4, 4, 2, 2, opt2, (uint32_t)) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. 
+#define DECL(w, opt) \ +int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + const uint8_t *sec, \ + ptrdiff_t sec_stride, \ + int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, \ + NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \ +FN(16, 8, 16, 4, 3, opt1, (uint32_t)); \ +FN(8, 16, 8, 3, 4, opt1, (uint32_t)); \ +FN(8, 8, 8, 3, 3, opt1, (uint32_t)); \ +FN(8, 4, 8, 3, 2, opt1, (uint32_t)); \ +FN(4, 8, 4, 2, 3, opt2, (uint32_t)); \ +FN(4, 4, 4, 2, 2, opt2, (uint32_t)) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN +#endif // CONFIG_USE_X86INC -- 2.40.0
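
Reviewer note (not part of the patch): the SIMD kernels and FN()/FNS() wrappers above all reduce to the same two-pass bilinear prediction followed by the identity variance = SSE - (sum^2 >> (log2 w + log2 h)). The plain-C sketch below is offered only as a reading aid under stated assumptions: the function name `subpel_variance_ref_c` is illustrative, the 7-bit filter table mirrors `bilinear_filters_mmx` from variance_mmx.c (the SSE2/AVX2 kernels use an arithmetically equivalent 4-bit table), and `src` is assumed readable one pixel past the right and bottom block edges, as the real routines require.

```c
#include <stdint.h>

/* 8-phase bilinear filter, 7-bit precision (matches bilinear_filters_mmx). */
static const int16_t bilinear_ref[8][2] = {
  { 128,   0 }, { 112,  16 }, {  96,  32 }, {  80,  48 },
  {  64,  64 }, {  48,  80 }, {  32,  96 }, {  16, 112 },
};

/* Scalar reference sketch of a w x h sub-pixel variance (w, h powers of 2). */
static uint32_t subpel_variance_ref_c(const uint8_t *src, int src_stride,
                                      int xoff, int yoff,
                                      const uint8_t *dst, int dst_stride,
                                      int w, int h, uint32_t *sse) {
  int r, c;
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      /* Separable bilinear prediction: horizontal tap on rows r and r + 1,
       * then a vertical tap between the two results.  Reads one pixel past
       * the right/bottom edge of the block, like the vectorized kernels. */
      const uint8_t *a = src + r * src_stride + c;
      const int top = (a[0] * bilinear_ref[xoff][0] +
                       a[1] * bilinear_ref[xoff][1] + 64) >> 7;
      const int bot = (a[src_stride] * bilinear_ref[xoff][0] +
                       a[src_stride + 1] * bilinear_ref[xoff][1] + 64) >> 7;
      const int pred = (top * bilinear_ref[yoff][0] +
                        bot * bilinear_ref[yoff][1] + 64) >> 7;
      const int diff = pred - dst[r * dst_stride + c];
      sum += diff;
      sse64 += diff * diff;
    }
  }
  *sse = (uint32_t)sse64;
  /* w * h is a power of two, so this division is the same as the
   * ">> (wlog2 + hlog2)" applied by the FN() wrappers. */
  return (uint32_t)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}
```

The same structure explains the wide-block wrappers: each FN() instance calls a 8/16/32-wide kernel on column strips (offsets +16, +32, +48), accumulates the strip sums and SSEs, and only then applies the subtraction above, which is why the 64x64 and 32x32 cases cast the squared sum to int64_t before shifting.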