From 88120481a4475c1b40f867b8d80edfd2a560a315 Mon Sep 17 00:00:00 2001 From: Jian Zhou Date: Thu, 10 Dec 2015 17:25:18 -0800 Subject: [PATCH] Code cleanup of tm_predictor_32x32 Reallocate the xmm register usage so that ARCH_X86_64 is no longer required. Reduce memory accesses to the left neighbor by half. Yields a single-digit speedup on big-core machines. Change-Id: I392515ed8e8aeb02e6a717b3966b1ba13f5be990 --- test/test_intra_pred_speed.cc | 9 ------- vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- vpx_dsp/x86/intrapred_sse2.asm | 43 ++++++++++++++++------------------ 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 4064ea645..3e65fecfb 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -337,21 +337,12 @@ INTRA_PRED_TEST(C, TestIntraPred32, vpx_dc_predictor_32x32_c, vpx_d63_predictor_32x32_c, vpx_tm_predictor_32x32_c) #if HAVE_SSE2 && CONFIG_USE_X86INC -#if ARCH_X86_64 INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, vpx_dc_left_predictor_32x32_sse2, vpx_dc_top_predictor_32x32_sse2, vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2, vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL, vpx_tm_predictor_32x32_sse2) -#else -INTRA_PRED_TEST(SSE2, TestIntraPred32, vpx_dc_predictor_32x32_sse2, - vpx_dc_left_predictor_32x32_sse2, - vpx_dc_top_predictor_32x32_sse2, - vpx_dc_128_predictor_32x32_sse2, vpx_v_predictor_32x32_sse2, - vpx_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, - NULL, NULL) -#endif // ARCH_X86_64 #endif // HAVE_SSE2 && CONFIG_USE_X86INC #if HAVE_SSSE3 && CONFIG_USE_X86INC diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4d36e2796..798dbf124 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -241,7 +241,7 @@ add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, con specialize qw/vpx_v_predictor_32x32 neon msa/, "$sse2_x86inc"; add_proto qw/void 
vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc"; +specialize qw/vpx_tm_predictor_32x32 neon msa/, "$sse2_x86inc"; add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_32x32 msa neon/, "$sse2_x86inc"; diff --git a/vpx_dsp/x86/intrapred_sse2.asm b/vpx_dsp/x86/intrapred_sse2.asm index f27b8d94e..151bbb37e 100644 --- a/vpx_dsp/x86/intrapred_sse2.asm +++ b/vpx_dsp/x86/intrapred_sse2.asm @@ -699,9 +699,8 @@ cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left jnz .loop REP_RET -%if ARCH_X86_64 INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left +cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left pxor m1, m1 movd m2, [aboveq-1] mova m0, [aboveq] @@ -722,31 +721,29 @@ cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left psubw m5, m2 .loop: movd m2, [leftq+lineq*2] - movd m6, [leftq+lineq*2+1] + pxor m1, m1 punpcklbw m2, m1 - punpcklbw m6, m1 + pshuflw m7, m2, 0x55 pshuflw m2, m2, 0x0 - pshuflw m6, m6, 0x0 punpcklqdq m2, m2 - punpcklqdq m6, m6 - paddw m7, m2, m0 - paddw m8, m2, m3 - paddw m9, m2, m4 - paddw m2, m5 - packuswb m7, m8 - packuswb m9, m2 - paddw m2, m6, m0 - paddw m8, m6, m3 - mova [dstq ], m7 - paddw m7, m6, m4 - paddw m6, m5 - mova [dstq +16], m9 - packuswb m2, m8 - packuswb m7, m6 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16], m7 + punpcklqdq m7, m7 + paddw m6, m2, m3 + paddw m1, m2, m0 + packuswb m1, m6 + mova [dstq ], m1 + paddw m6, m2, m5 + paddw m1, m2, m4 + packuswb m1, m6 + mova [dstq+16 ], m1 + paddw m6, m7, m3 + paddw m1, m7, m0 + packuswb m1, m6 + mova [dstq+strideq ], m1 + paddw m6, m7, m5 + paddw m1, m7, m4 + packuswb m1, m6 + mova [dstq+strideq+16], m1 lea dstq, [dstq+strideq*2] inc lineq jnz .loop REP_RET -%endif -- 2.49.0