From: Scott LaVarnway Date: Tue, 20 Mar 2012 20:32:54 +0000 (-0400) Subject: Updated vp8_build_intra_predictors_mbuv_s(sse2/ssse3) X-Git-Tag: v1.1.0~41^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ccea000c4bdc1691ba59f409393fc480a0c3d67e;p=libvpx Updated vp8_build_intra_predictors_mbuv_s(sse2/ssse3) to work with the latest code. Change-Id: Ie382bb55d00ea5929bdadba859eea15f696d4cd9 --- diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index fee896595..c069a21f1 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -126,7 +126,7 @@ prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned #TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride" -#TODO: fix assembly --- specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3 +specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3 prototype void vp8_intra4x4_predict "unsigned char *src, int src_stride, int b_mode, unsigned char *dst, int dst_stride" specialize vp8_intra4x4_predict media diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 4b68ef5f2..d371ebd74 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -119,35 +119,39 @@ sym(vp8_copy_mem16x16_sse2): ;void vp8_intra_pred_uv_dc_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dc_mmx2) sym(vp8_intra_pred_uv_dc_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above; pxor mm0, mm0 movq mm1, [rsi] psadbw mm1, mm0 ; from left - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax*1] + add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx + + movzx edx, byte [rsi+rdi] - add ecx, edx lea rsi, [rsi+rax*4] + add ecx, edx movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -156,8 +160,6 @@ sym(vp8_intra_pred_uv_dc_mmx2): add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx ; add up pextrw edx, mm1, 0x0 @@ -192,23 +194,24 @@ sym(vp8_intra_pred_uv_dc_mmx2): ;void vp8_intra_pred_uv_dctop_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dctop_mmx2) sym(vp8_intra_pred_uv_dctop_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi ; end prolog + ;arg(3), arg(4) not used + ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above; pxor mm0, mm0 movq mm1, [rsi] psadbw mm1, mm0 @@ -245,22 +248,24 @@ sym(vp8_intra_pred_uv_dctop_mmx2): ;void vp8_intra_pred_uv_dcleft_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dcleft_mmx2) sym(vp8_intra_pred_uv_dcleft_mmx2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; lea rdi, [rax*3] movzx ecx, byte [rsi] movzx edx, byte [rsi+rax] @@ -310,17 +315,20 @@ sym(vp8_intra_pred_uv_dcleft_mmx2): ;void vp8_intra_pred_uv_dc128_mmx( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_dc128_mmx) sym(vp8_intra_pred_uv_dc128_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx ; end prolog + ;arg(2), arg(3), arg(4) not used + ; write out movq mm1, [GLOBAL(dc_128)] mov rax, arg(0) ;dst; @@ -346,15 +354,16 @@ sym(vp8_intra_pred_uv_dc128_mmx): ;void vp8_intra_pred_uv_tm_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) %macro vp8_intra_pred_uv_tm 1 global sym(vp8_intra_pred_uv_tm_%1) sym(vp8_intra_pred_uv_tm_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx push rsi push rdi @@ -362,9 +371,8 @@ sym(vp8_intra_pred_uv_tm_%1): ; read top row mov edx, 4 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; pxor xmm0, xmm0 %ifidn %1, ssse3 movdqa xmm2, [GLOBAL(dc_1024)] @@ -374,7 +382,7 @@ sym(vp8_intra_pred_uv_tm_%1): ; set up left ptrs ans subtract topleft movd xmm3, [rsi-1] - lea rsi, [rsi+rax-1] + mov rsi, arg(3) ;left; %ifidn %1, sse2 punpcklbw xmm3, xmm0 pshuflw xmm3, xmm3, 0x0 @@ -427,20 +435,22 @@ vp8_intra_pred_uv_tm ssse3 ;void vp8_intra_pred_uv_ve_mmx( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_uv_ve_mmx) sym(vp8_intra_pred_uv_ve_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 ; end prolog + ; arg(3), arg(4) not used + ; read from top mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx + movq mm1, [rax] ; write out @@ -466,15 +476,16 @@ sym(vp8_intra_pred_uv_ve_mmx): ;void vp8_intra_pred_uv_ho_mmx2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) %macro vp8_intra_pred_uv_ho 1 global sym(vp8_intra_pred_uv_ho_%1) sym(vp8_intra_pred_uv_ho_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi %ifidn %1, ssse3 @@ -485,12 +496,14 @@ sym(vp8_intra_pred_uv_ho_%1): %endif ; end prolog + ;arg(2) not used + ; read from left and write out %ifidn %1, mmx2 mov edx, 4 %endif - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride %ifidn %1, ssse3 @@ -498,7 +511,7 @@ sym(vp8_intra_pred_uv_ho_%1): movdqa xmm2, [GLOBAL(dc_00001111)] lea rbx, [rax*3] %endif - dec rsi + %ifidn %1, mmx2 .vp8_intra_pred_uv_ho_%1_loop: movd mm0, [rsi] diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c index cb9ab80b9..949b2fb0e 100644 --- a/vp8/common/x86/recon_wrapper_sse2.c +++ b/vp8/common/x86/recon_wrapper_sse2.c @@ -15,7 +15,8 @@ #define build_intra_predictors_mbuv_prototype(sym) \ void sym(unsigned char *dst, int dst_stride, \ - const unsigned char *src, int src_stride) + const unsigned char *above, \ + const unsigned char *left, int left_stride) typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t)); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2); @@ -29,15 +30,19 @@ extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2); extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3); static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, unsigned char *dst_u, unsigned char *dst_v, int dst_stride, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, build_intra_predictors_mbuv_fn_t tm_func, build_intra_predictors_mbuv_fn_t ho_func) { int mode = x->mode_info_context->mbmi.uv_mode; build_intra_predictors_mbuv_fn_t fn; - int src_stride = x->dst.uv_stride; switch (mode) { case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break; @@ -59,38 +64,48 @@ static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, default: return; } - fn(dst_u, dst_stride, x->dst.u_buffer, src_stride); - fn(dst_v, dst_stride, x->dst.v_buffer, src_stride); + fn(dst_u, dst_stride, uabove_row, uleft, left_stride); + fn(dst_v, dst_stride, vabove_row, vleft, left_stride); } -void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x) +void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { - vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], - &x->predictor[320], 8, + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, vp8_intra_pred_uv_tm_sse2, vp8_intra_pred_uv_ho_mmx2); } -void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x) +void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x, + unsigned char * uabove_row, + unsigned char * vabove_row, + unsigned char * uleft, + unsigned char * vleft, + int left_stride, + unsigned char * upred_ptr, + unsigned char * vpred_ptr, + int pred_stride) { - vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], - &x->predictor[320], 8, - vp8_intra_pred_uv_tm_ssse3, - vp8_intra_pred_uv_ho_ssse3); -} - -void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x) -{ - vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, - x->dst.v_buffer, x->dst.uv_stride, - vp8_intra_pred_uv_tm_sse2, - vp8_intra_pred_uv_ho_mmx2); -} - -void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x) -{ - vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, - x->dst.v_buffer, x->dst.uv_stride, + vp8_build_intra_predictors_mbuv_x86(x, + uabove_row, vabove_row, + upred_ptr, + vpred_ptr, pred_stride, + uleft, + vleft, + left_stride, vp8_intra_pred_uv_tm_ssse3, vp8_intra_pred_uv_ho_ssse3); } @@ -132,22 +147,10 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x, default: return; } - fn(dst_y, dst_stride, x->dst.y_buffer, src_stride); +// fn(dst_y, dst_stride, x->dst.y_buffer, src_stride); return; } -void vp8_build_intra_predictors_mby_sse2(MACROBLOCKD *x) -{ - vp8_build_intra_predictors_mby_x86(x, x->predictor, 16, - vp8_intra_pred_y_tm_sse2); -} - -void vp8_build_intra_predictors_mby_ssse3(MACROBLOCKD *x) -{ - vp8_build_intra_predictors_mby_x86(x, x->predictor, 16, - vp8_intra_pred_y_tm_ssse3); -} - void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x) { vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride,