From: Ronald S. Bultje Date: Fri, 29 Apr 2011 18:51:37 +0000 (-0700) Subject: Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3. X-Git-Tag: v0.9.7~147^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=5a23352c030d2b190976ea55a9a759c734bd9eaa;p=libvpx Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3. Change-Id: I658a1df7d825f820573cb2d11ad402f9d2791035 --- diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 97dc4f686..aaa6a8fb9 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -578,23 +578,35 @@ sym(vp8_intra_pred_uv_ve_mmx): ; unsigned char *src, ; int src_stride, ; ) -global sym(vp8_intra_pred_uv_ho_mmx2) -sym(vp8_intra_pred_uv_ho_mmx2): +%macro vp8_intra_pred_uv_ho 1 +global sym(vp8_intra_pred_uv_ho_%1) +sym(vp8_intra_pred_uv_ho_%1): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 push rsi push rdi +%ifidn %1, ssse3 + push rbx +%endif ; end prolog ; read from left and write out +%ifidn %1, mmx2 mov edx, 4 +%endif mov rsi, arg(2) ;src; movsxd rax, dword ptr arg(3) ;src_stride; mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride +%ifidn %1, ssse3 + lea rbx, [rax*3] + lea rdx, [rcx*3] + movdqa xmm2, [GLOBAL(dc_00001111)] +%endif dec rsi -vp8_intra_pred_uv_ho_mmx2_loop: +%ifidn %1, mmx2 +vp8_intra_pred_uv_ho_%1_loop: movd mm0, [rsi] movd mm1, [rsi+rax] punpcklbw mm0, mm0 @@ -606,14 +618,49 @@ vp8_intra_pred_uv_ho_mmx2_loop: lea rsi, [rsi+rax*2] lea rdi, [rdi+rcx*2] dec edx - jnz vp8_intra_pred_uv_ho_mmx2_loop + jnz vp8_intra_pred_uv_ho_%1_loop +%else + movd xmm0, [rsi] + movd xmm3, [rsi+rax] + movd xmm1, [rsi+rax*2] + movd xmm4, [rsi+rbx] + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 + lea rsi, [rsi+rax*4] + lea rdi, [rdi+rcx*4] + movd xmm0, [rsi] + movd xmm3, [rsi+rax] + movd xmm1, [rsi+rax*2] + movd xmm4, [rsi+rbx] + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 +%endif ; begin epilog +%ifidn %1, ssse3 + pop rbx +%endif pop rdi pop rsi UNSHADOW_ARGS pop rbp ret +%endmacro + +vp8_intra_pred_uv_ho mmx2 +vp8_intra_pred_uv_ho ssse3 SECTION_RODATA dc_128: @@ -623,3 +670,7 @@ dc_4: align 16 dc_1024: times 8 dw 0x400 +align 16 +dc_00001111: + times 8 db 0 + times 8 db 1 diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c index 86b4da2c2..cb7b69c08 100644 --- a/vp8/common/x86/recon_wrapper_sse2.c +++ b/vp8/common/x86/recon_wrapper_sse2.c @@ -23,6 +23,7 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_ssse3); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3); @@ -31,7 +32,8 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, unsigned char *dst_u, unsigned char *dst_v, int dst_stride, - build_intra_predictors_mbuv_fn_t tm_func) + build_intra_predictors_mbuv_fn_t tm_func, + build_intra_predictors_mbuv_fn_t ho_func) { int mode = x->mode_info_context->mbmi.uv_mode; build_intra_predictors_mbuv_fn_t fn; @@ -39,7 +41,7 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, switch (mode) { case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break; - case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break; + case H_PRED: fn = ho_func; break; case TM_PRED: fn = tm_func; break; case DC_PRED: if (x->up_available) { @@ -65,26 +67,30 @@ void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x) { vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], &x->predictor[320], 8, - vp8_intra_pred_uv_tm_sse2); + vp8_intra_pred_uv_tm_sse2, + vp8_intra_pred_uv_ho_mmx2); } void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x) { vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], &x->predictor[320], 8, - vp8_intra_pred_uv_tm_ssse3); + vp8_intra_pred_uv_tm_ssse3, + vp8_intra_pred_uv_ho_ssse3); } void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x) { vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, x->dst.v_buffer, x->dst.uv_stride, - vp8_intra_pred_uv_tm_sse2); + vp8_intra_pred_uv_tm_sse2, + vp8_intra_pred_uv_ho_mmx2); } void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x) { vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, x->dst.v_buffer, x->dst.uv_stride, - vp8_intra_pred_uv_tm_ssse3); + vp8_intra_pred_uv_tm_ssse3, + vp8_intra_pred_uv_ho_ssse3); }