From: Jian Zhou Date: Tue, 24 Nov 2015 20:16:05 +0000 (-0800) Subject: SSE2 speed up of h_predictor_4x4 X-Git-Tag: v1.6.0~517^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9d29d7628062d8db6795c73eccbbbf2c3df41e00;p=libvpx SSE2 speed up of h_predictor_4x4 Relocate h_predictor_4x4 from SSSE3 to SSE2 with XMM registers. Speed up by ~25% in ./test_intra_pred_speed. Change-Id: I64e14c13b482a471449be3559bfb0da45cf88d9d --- diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index d44a64a0b..58d10976f 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -195,15 +195,16 @@ INTRA_PRED_TEST(SSE, TestIntraPred4, vpx_dc_predictor_4x4_sse, #endif // HAVE_SSE && CONFIG_USE_X86INC #if HAVE_SSE2 && CONFIG_USE_X86INC -INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_4x4_sse2) +INTRA_PRED_TEST(SSE2, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, + vpx_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL, + vpx_tm_predictor_4x4_sse2) #endif // HAVE_SSE2 && CONFIG_USE_X86INC #if HAVE_SSSE3 && CONFIG_USE_X86INC INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, - vpx_h_predictor_4x4_ssse3, vpx_d45_predictor_4x4_ssse3, NULL, - NULL, vpx_d153_predictor_4x4_ssse3, - vpx_d207_predictor_4x4_ssse3, vpx_d63_predictor_4x4_ssse3, NULL) + NULL, vpx_d45_predictor_4x4_ssse3, NULL, NULL, + vpx_d153_predictor_4x4_ssse3, vpx_d207_predictor_4x4_ssse3, + vpx_d63_predictor_4x4_ssse3, NULL) #endif // HAVE_SSSE3 && CONFIG_USE_X86INC #if HAVE_DSPR2 diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4de85a431..d11b32778 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -76,7 +76,7 @@ add_proto qw/void vpx_d63f_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co specialize qw/vpx_d63f_predictor_4x4/; add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc"; +specialize qw/vpx_h_predictor_4x4 neon dspr2 msa/, "$sse2_x86inc"; add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_he_predictor_4x4/; diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm index 04b39a583..5ef7ae313 100644 --- a/vpx_dsp/x86/intrapred_sse2.asm +++ b/vpx_dsp/x86/intrapred_sse2.asm @@ -515,6 +515,22 @@ cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above jnz .loop REP_RET +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + INIT_XMM sse2 cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left pxor m1, m1 diff --git a/vpx_dsp/x86/intrapred_ssse3.asm b/vpx_dsp/x86/intrapred_ssse3.asm index 88df9b2d1..744c40c8b 100644 --- a/vpx_dsp/x86/intrapred_ssse3.asm +++ b/vpx_dsp/x86/intrapred_ssse3.asm @@ -33,24 +33,6 @@ sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 SECTION .text -INIT_MMX ssse3 -cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left - movifnidn leftq, leftmp - add leftq, 4 - mov lineq, -2 - pxor m0, m0 -.loop: - movd m1, [leftq+lineq*2 ] - movd m2, [leftq+lineq*2+1] - pshufb m1, m0 - pshufb m2, m0 - movd [dstq ], m1 - movd [dstq+strideq], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - INIT_MMX ssse3 cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left movifnidn leftq, leftmp