From afd2f68daef62d1185ba1e65971fdd3a7fc1d8eb Mon Sep 17 00:00:00 2001 From: James Zern Date: Tue, 4 Aug 2015 17:52:57 -0700 Subject: [PATCH] Revert "VP9_COPY_CONVOLVE_SSE2 optimization" This reverts commit a5e97d874b16ae5826b68515f1e35ffb44361cf8. Additionally: Revert "vpx_convolve_copy_sse2: fix win64" This reverts commit 22a8474fe7ec30d96f746dc6e4b23771758c071e. This change performs poorly on various x86_64 devices affecting performance by 1-3% at 1080P. Performance on chromebook like devices was mixed neutral to slightly negative, so there should be minimal change there. Change-Id: I95831233b4b84ee96369baa192a2d4cc7639658c --- vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 341 +++++++------------------ 1 file changed, 96 insertions(+), 245 deletions(-) diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index 061b34d3f..6cd620a59 100644 --- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -16,289 +16,140 @@ SECTION .text %macro convolve_fn 1 INIT_XMM sse2 -cglobal convolve_%1, 4, 7, 8, src, src_stride, dst, dst_stride, \ +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \ fx, fxs, fy, fys, w, h - mov r4d, dword wm - cmp r4d, 4 + mov r4d, dword wm + cmp r4d, 4 je .w4 - cmp r4d, 8 + cmp r4d, 8 je .w8 - cmp r4d, 16 + cmp r4d, 16 je .w16 - cmp r4d, 32 + cmp r4d, 32 je .w32 - ; 64xh - mov r4d, dword hm - shr r4d, 1 ; ASSUMPTION: hm is at least EVEN - sub r4d, 1 - - movu m0, [srcq] - movu m4, [srcq+src_strideq] - movu m1, [srcq+16] - movu m5, [srcq+src_strideq+16] - movu m2, [srcq+32] - movu m6, [srcq+src_strideq+32] - movu m3, [srcq+48] - movu m7, [srcq+src_strideq+48] - + mov r4d, dword hm .loop64: - prefetcht0 [srcq+64 ] - prefetcht0 [srcq+src_strideq+64] - - lea srcq, [srcq+src_strideq*2] - + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+16] - - mova [dstq ], m0 - movu m0, [srcq] - - mova [dstq+16], m1 - movu m1, [srcq+16] - - pavgb m2, [dstq+32] - mova [dstq+32], m2 - movu m2, [srcq+32] - pavgb m3, [dstq+48] - mova [dstq+48], m3 - movu m3, [srcq+48] - pavgb m4, [dstq+dst_strideq] - - mova [dstq+dst_strideq], m4 - movu m4, [srcq+src_strideq] - - pavgb m5, [dstq+dst_strideq+16] - mova [dstq+dst_strideq+16], m5 - movu m5, [srcq+src_strideq+16] - pavgb m6, [dstq+dst_strideq+32] - mova [dstq+dst_strideq+32], m6 - movu m6, [srcq+src_strideq+32] - pavgb m7, [dstq+dst_strideq+48] - mova [dstq+dst_strideq+48], m7 - movu m7, [srcq+src_strideq+48] - - lea dstq, [dstq+dst_strideq*2] -%else - mova [dstq ], m0 - movu m0, [srcq] - - mova [dstq+16], m1 - movu m1, [srcq+16] - mova [dstq+32], m2 - movu m2, [srcq+32] - mova [dstq+48], m3 - movu m3, [srcq+48] - - mova [dstq+dst_strideq], m4 - movu m4, [srcq+src_strideq] - - mova [dstq+dst_strideq+16], m5 - movu m5, [srcq+src_strideq+16] - mova [dstq+dst_strideq+32], m6 - movu m6, [srcq+src_strideq+32] - mova [dstq+dst_strideq+48], m7 - movu m7, [srcq+src_strideq+48] - - lea dstq, [dstq+dst_strideq*2] + pavgb m0, [dstq] + pavgb m1, [dstq+16] + pavgb m2, [dstq+32] + pavgb m3, [dstq+48] %endif - dec r4d + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + dec r4d jnz .loop64 - -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+16] - pavgb m2, [dstq+32] - pavgb m3, [dstq+48] - pavgb m4, [dstq+dst_strideq] - pavgb m5, [dstq+dst_strideq+16] - pavgb m6, [dstq+dst_strideq+32] - pavgb m7, [dstq+dst_strideq+48] 
-%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - - mova [dstq+dst_strideq ], m4 - mova [dstq+dst_strideq+16], m5 - mova [dstq+dst_strideq+32], m6 - mova [dstq+dst_strideq+48], m7 - RET .w32: - mov r4d, dword hm - sub r4d, 2 - - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+src_strideq] - movu m3, [srcq+src_strideq+16] - + mov r4d, dword hm .loop32: - prefetcht0 [srcq+64] - prefetcht0 [srcq+src_strideq+64] - - lea srcq, [srcq+src_strideq*2] + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+16] - pavgb m2, [dstq+dst_strideq] - pavgb m3, [dstq+dst_strideq+16] + pavgb m0, [dstq] + pavgb m1, [dstq +16] + pavgb m2, [dstq+dst_strideq] + pavgb m3, [dstq+dst_strideq+16] %endif - mova [dstq], m0 - movu m0, [srcq] - - mova [dstq+16], m1 - movu m1, [srcq+16] - - mova [dstq+dst_strideq], m2 - movu m2, [srcq+src_strideq] - - mova [dstq+dst_strideq+16], m3 - movu m3, [srcq+src_strideq+16] - - lea dstq, [dstq+dst_strideq*2] - - sub r4d, 2 + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 jnz .loop32 - -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+16] - pavgb m2, [dstq+dst_strideq] - pavgb m3, [dstq+dst_strideq+16] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - - mova [dstq+dst_strideq ], m2 - mova [dstq+dst_strideq+16], m3 - RET .w16: - mov r4d, dword hm - sub r4d, 4 - - movu m0, [srcq] - movu m1, [srcq+src_strideq] - + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] .loop16: - lea srcq, [srcq+src_strideq] - prefetcht0 [srcq+src_strideq*4] - lea srcq, [srcq+src_strideq] - prefetcht0 [srcq+src_strideq*2] + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] %endif - mova [dstq ], m0 - mova [dstq+dst_strideq], m1 - - lea dstq, [dstq+dst_strideq*2] - - movu m0, [srcq] - movu m1, [srcq+src_strideq] - - sub r4d, 2 + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 jnz .loop16 - - lea srcq, [srcq+src_strideq*2] -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq], m1 - - lea dstq, [dstq+dst_strideq*2] - - movu m0, [srcq] - movu m1, [srcq+src_strideq] - -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] -%endif - - mova [dstq ], m0 - mova [dstq+dst_strideq], m1 - RET INIT_MMX sse .w8: - mov r4d, dword hm - sub r4d, 2 - - movu m0, [srcq] - movu m1, [srcq+src_strideq] - + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] .loop8: - lea srcq, [srcq+src_strideq] - prefetcht0 [srcq+src_strideq*4] - lea srcq, [srcq+src_strideq] - prefetcht0 [srcq+src_strideq*2] - + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] + pavgb m0, [dstq] + pavgb m1, [dstq+dst_strideq] + pavgb m2, [dstq+dst_strideq*2] + pavgb m3, [dstq+r6q] %endif - mova [dstq ], m0 - mova [dstq+dst_strideq], m1 - - movu m0, [srcq] - movu m1, 
[srcq+src_strideq] - - lea dstq, [dstq+dst_strideq*2] - - sub r4d, 2 + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 jnz .loop8 - -%ifidn %1, avg - pavgb m0, [dstq] - pavgb m1, [dstq+dst_strideq] -%endif - mova [dstq ], m0 - mova [dstq+dst_strideq], m1 - RET .w4: - mov r4d, dword hm - - lea r5q, [src_strideq*3] - lea r6q, [dst_strideq*3] - + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] .loop4: - movh m0, [srcq] - movh m1, [srcq+src_strideq] - movh m2, [srcq+src_strideq*2] - movh m3, [srcq+r5q] - - lea srcq, [srcq+src_strideq*4] + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] %ifidn %1, avg - movh m4, [dstq] - movh m5, [dstq+dst_strideq] - movh m6, [dstq+dst_strideq*2] - movh m7, [dstq+r6q] - - pavgb m0, m4 - pavgb m1, m5 - pavgb m2, m6 - pavgb m3, m7 + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavgb m0, m4 + pavgb m1, m5 + pavgb m2, m6 + pavgb m3, m7 %endif - movh [dstq ], m0 - movh [dstq+dst_strideq ], m1 - movh [dstq+dst_strideq*2], m2 - movh [dstq+r6q ], m3 - - lea dstq, [dstq+dst_strideq*4] - - sub r4d, 4 + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 jnz .loop4 RET %endmacro -- 2.40.0
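
For reference, below is a minimal C sketch of the behaviour that both the reverted and the restored asm implement: convolve_copy is a plain w-by-h block copy, and convolve_avg rounds each output byte to the average of source and destination, matching pavgb's (a + b + 1) >> 1. The function names and the trimmed parameter list are illustrative assumptions only (the real entry points also take the unused filter arguments fx/fxs/fy/fys seen in the cglobal declaration); this is not the library's prototype.

#include <stddef.h>
#include <stdint.h>

/* Plain block copy: the scalar equivalent of the w4/w8/w16/w32/64 copy paths. */
static void convolve_copy_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = src[x];
    src += src_stride;
    dst += dst_stride;
  }
}

/* Rounded byte average: the scalar equivalent of pavgb in the avg variant. */
static void convolve_avg_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}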