From: Scott LaVarnway Date: Thu, 29 Mar 2012 18:24:53 +0000 (-0400) Subject: Updated vp8_build_intra_predictors_mby_s(sse2/ssse3) X-Git-Tag: v1.1.0~40 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a337725625207758e56c1820c3a06bc00ca22b1a;p=libvpx Updated vp8_build_intra_predictors_mby_s(sse2/ssse3) to work with the latest code. Patch Set 2: aligned the above_row buffers to fix crash Change-Id: I7a6992a20ed079ccd302f8c26215cf3057f8b70c --- diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index f1cd12fc1..4067a6851 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -14,7 +14,7 @@ #include "vpx_mem/vpx_mem.h" #include "blockd.h" -void vp8_build_intra_predictors_mby_s(MACROBLOCKD *x, +void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index c069a21f1..ab9951528 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -123,7 +123,8 @@ specialize vp8_copy_mem8x4 mmx media neon vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6 prototype void vp8_build_intra_predictors_mby_s "struct macroblockd *x, unsigned char * yabove_row, unsigned char * yleft, int left_stride, unsigned char * ypred_ptr, int y_stride" -#TODO: fix assembly --- specialize vp8_build_intra_predictors_mby_s sse2 ssse3 neon +specialize vp8_build_intra_predictors_mby_s sse2 ssse3 +#TODO: fix assembly for neon prototype void vp8_build_intra_predictors_mbuv_s "struct macroblockd *x, unsigned char * uabove_row, unsigned char * vabove_row, unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char * upred_ptr, unsigned char * vpred_ptr, int pred_stride" specialize vp8_build_intra_predictors_mbuv_s sse2 ssse3 diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index d371ebd74..7b6e3cffe 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -133,22 +133,20 @@ sym(vp8_intra_pred_uv_dc_mmx2): ; end prolog ; from top - mov rsi, arg(2) ;above; - pxor mm0, mm0 - movq mm1, [rsi] - psadbw mm1, mm0 - - ; from left + mov rdi, arg(2) ;above; mov rsi, arg(3) ;left; movsxd rax, dword ptr arg(4) ;left_stride; + pxor mm0, mm0 + movq mm1, [rdi] lea rdi, [rax*3] + psadbw mm1, mm0 + ; from left movzx ecx, byte [rsi] movzx edx, byte [rsi+rax*1] add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx - movzx edx, byte [rsi+rdi] lea rsi, [rsi+rax*4] add ecx, edx @@ -166,23 +164,23 @@ sym(vp8_intra_pred_uv_dc_mmx2): lea edx, [edx+ecx+8] sar edx, 4 movd mm1, edx + movsxd rcx, dword ptr arg(1) ;dst_stride pshufw mm1, mm1, 0x0 + mov rdi, arg(0) ;dst; packuswb mm1, mm1 ; write out - mov rdi, arg(0) ;dst; - movsxd rcx, dword ptr arg(1) ;dst_stride lea rax, [rcx*3] + lea rdx, [rdi+rcx*4] movq [rdi ], mm1 movq [rdi+rcx ], mm1 movq [rdi+rcx*2], mm1 movq [rdi+rax ], mm1 - lea rdi, [rdi+rcx*4] - movq [rdi ], mm1 - movq [rdi+rcx ], mm1 - movq [rdi+rcx*2], mm1 - movq [rdi+rax ], mm1 + movq [rdx ], mm1 + movq [rdx+rcx ], mm1 + movq [rdx+rcx*2], mm1 + movq [rdx+rax ], mm1 ; begin epilog pop rdi @@ -478,7 +476,7 @@ sym(vp8_intra_pred_uv_ve_mmx): ; int dst_stride ; unsigned char *above, ; unsigned char *left, -; int left_stride, +; int left_stride ; ) %macro vp8_intra_pred_uv_ho 1 global sym(vp8_intra_pred_uv_ho_%1) @@ -575,38 +573,43 @@ vp8_intra_pred_uv_ho ssse3 ;void vp8_intra_pred_y_dc_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dc_sse2) sym(vp8_intra_pred_y_dc_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog ; from top - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rdi, arg(2) ;above + mov rsi, arg(3) ;left + movsxd rax, dword ptr arg(4) ;left_stride; + pxor xmm0, xmm0 - movdqa xmm1, [rsi] + movdqa xmm1, [rdi] psadbw xmm1, xmm0 movq xmm2, xmm1 punpckhqdq xmm1, xmm1 paddw xmm1, xmm2 ; from left - dec rsi lea rdi, [rax*3] - movzx ecx, byte [rsi+rax] + + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx movzx edx, byte [rsi+rax*2] add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -616,6 +619,7 @@ sym(vp8_intra_pred_y_dc_sse2): movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -625,6 +629,7 @@ sym(vp8_intra_pred_y_dc_sse2): movzx edx, byte [rsi+rdi] add ecx, edx lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] add ecx, edx movzx edx, byte [rsi+rax] @@ -633,8 +638,6 @@ sym(vp8_intra_pred_y_dc_sse2): add ecx, edx movzx edx, byte [rsi+rdi] add ecx, edx - movzx edx, byte [rsi+rax*4] - add ecx, edx ; add up pextrw edx, xmm1, 0x0 @@ -676,22 +679,23 @@ sym(vp8_intra_pred_y_dc_sse2): ;void vp8_intra_pred_y_dctop_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dctop_sse2) sym(vp8_intra_pred_y_dctop_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi GET_GOT rbx ; end prolog + ;arg(3), arg(4) not used + ; from top - mov rcx, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rcx, rax + mov rcx, arg(2) ;above; pxor xmm0, xmm0 movdqa xmm1, [rcx] psadbw xmm1, xmm0 @@ -737,22 +741,25 @@ sym(vp8_intra_pred_y_dctop_sse2): ;void vp8_intra_pred_y_dcleft_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dcleft_sse2) sym(vp8_intra_pred_y_dcleft_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; from left - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - dec rsi + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; + lea rdi, [rax*3] movzx ecx, byte [rsi] movzx edx, byte [rsi+rax] @@ -827,18 +834,21 @@ sym(vp8_intra_pred_y_dcleft_sse2): ;void vp8_intra_pred_y_dc128_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_dc128_sse2) sym(vp8_intra_pred_y_dc128_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi GET_GOT rbx ; end prolog + ;arg(2), arg(3), arg(4) not used + ; write out mov rsi, 2 movdqa xmm1, [GLOBAL(dc_128)] @@ -870,15 +880,16 @@ sym(vp8_intra_pred_y_dc128_sse2): ;void vp8_intra_pred_y_tm_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) %macro vp8_intra_pred_y_tm 1 global sym(vp8_intra_pred_y_tm_%1) sym(vp8_intra_pred_y_tm_%1): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi GET_GOT rbx @@ -886,9 +897,8 @@ sym(vp8_intra_pred_y_tm_%1): ; read top row mov edx, 8 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; - sub rsi, rax + mov rsi, arg(2) ;above + movsxd rax, dword ptr arg(4) ;left_stride; pxor xmm0, xmm0 %ifidn %1, ssse3 movdqa xmm3, [GLOBAL(dc_1024)] @@ -900,7 +910,7 @@ sym(vp8_intra_pred_y_tm_%1): ; set up left ptrs ans subtract topleft movd xmm4, [rsi-1] - lea rsi, [rsi+rax-1] + mov rsi, arg(3) ;left %ifidn %1, sse2 punpcklbw xmm4, xmm0 pshuflw xmm4, xmm4, 0x0 @@ -958,27 +968,29 @@ vp8_intra_pred_y_tm ssse3 ;void vp8_intra_pred_y_ve_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride ; ) global sym(vp8_intra_pred_y_ve_sse2) sym(vp8_intra_pred_y_ve_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi ; end prolog + ;arg(3), arg(4) not used + + mov rax, arg(2) ;above; + mov rsi, 2 + movsxd rdx, dword ptr arg(1) ;dst_stride + ; read from top - mov rax, arg(2) ;src; - movsxd rdx, dword ptr arg(3) ;src_stride; - sub rax, rdx movdqa xmm1, [rax] ; write out - mov rsi, 2 mov rax, arg(0) ;dst; - movsxd rdx, dword ptr arg(1) ;dst_stride lea rcx, [rdx*3] .label @@ -1004,25 +1016,27 @@ sym(vp8_intra_pred_y_ve_sse2): ;void vp8_intra_pred_y_ho_sse2( ; unsigned char *dst, ; int dst_stride -; unsigned char *src, -; int src_stride, +; unsigned char *above, +; unsigned char *left, +; int left_stride, ; ) global sym(vp8_intra_pred_y_ho_sse2) sym(vp8_intra_pred_y_ho_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 + SHADOW_ARGS_TO_STACK 5 push rsi push rdi ; end prolog + ;arg(2) not used + ; read from left and write out mov edx, 8 - mov rsi, arg(2) ;src; - movsxd rax, dword ptr arg(3) ;src_stride; + mov rsi, arg(3) ;left; + movsxd rax, dword ptr arg(4) ;left_stride; mov rdi, arg(0) ;dst; movsxd rcx, dword ptr arg(1) ;dst_stride - dec rsi vp8_intra_pred_y_ho_sse2_loop: movd xmm0, [rsi] diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c index 949b2fb0e..b482faa3f 100644 --- a/vp8/common/x86/recon_wrapper_sse2.c +++ b/vp8/common/x86/recon_wrapper_sse2.c @@ -110,23 +110,32 @@ void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x, vp8_intra_pred_uv_ho_ssse3); } -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dctop_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dcleft_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_dc128_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ho_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_ve_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_sse2); -extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_y_tm_ssse3); +#define build_intra_predictors_mby_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *above, \ + const unsigned char *left, int left_stride) +typedef build_intra_predictors_mby_prototype((*build_intra_predictors_mby_fn_t)); + +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dctop_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dcleft_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_dc128_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ho_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_ve_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_sse2); +extern build_intra_predictors_mby_prototype(vp8_intra_pred_y_tm_ssse3); static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x, + unsigned char * yabove_row, unsigned char *dst_y, int dst_stride, - build_intra_predictors_mbuv_fn_t tm_func) + unsigned char * yleft, + int left_stride, + build_intra_predictors_mby_fn_t tm_func) { int mode = x->mode_info_context->mbmi.mode; build_intra_predictors_mbuv_fn_t fn; - int src_stride = x->dst.y_stride; + switch (mode) { case V_PRED: fn = vp8_intra_pred_y_ve_sse2; break; case H_PRED: fn = vp8_intra_pred_y_ho_sse2; break; @@ -147,19 +156,31 @@ static void vp8_build_intra_predictors_mby_x86(MACROBLOCKD *x, default: return; } -// fn(dst_y, dst_stride, x->dst.y_buffer, src_stride); + fn(dst_y, dst_stride, yabove_row, yleft, left_stride); return; } -void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x) +void vp8_build_intra_predictors_mby_s_sse2(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { - vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride, + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, vp8_intra_pred_y_tm_sse2); } -void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x) +void vp8_build_intra_predictors_mby_s_ssse3(MACROBLOCKD *x, + unsigned char * yabove_row, + unsigned char * yleft, + int left_stride, + unsigned char * ypred_ptr, + int y_stride) { - vp8_build_intra_predictors_mby_x86(x, x->dst.y_buffer, x->dst.y_stride, + vp8_build_intra_predictors_mby_x86(x, yabove_row, ypred_ptr, + y_stride, yleft, left_stride, vp8_intra_pred_y_tm_ssse3); } diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 20c564b32..845228bb5 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -815,15 +815,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) /* Allocate memory for above_row buffers. */ CHECK_MEM_ERROR(pbi->mt_yabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows)); for (i=0; i< pc->mb_rows; i++) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_calloc(sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)), 1)); + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (width + (VP8BORDERINPIXELS<<1)))); CHECK_MEM_ERROR(pbi->mt_uabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows)); for (i=0; i< pc->mb_rows; i++) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1)); + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); CHECK_MEM_ERROR(pbi->mt_vabove_row, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows)); for (i=0; i< pc->mb_rows; i++) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_calloc(sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS), 1)); + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], vpx_memalign(16,sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. */ CHECK_MEM_ERROR(pbi->mt_yleft_col, vpx_malloc(sizeof(unsigned char *) * pc->mb_rows));