From 85358d04cded41b03c1ee6912f2fb98b357de192 Mon Sep 17 00:00:00 2001 From: Johann Date: Tue, 23 Aug 2011 20:42:45 -0400 Subject: [PATCH] Fix data accesses for simple loopfilters The data that the simple horizontal loopfilter reads is aligned, treat it accordingly. For the vertical, we only use the bottom 4 bytes, so don't read in 16 (and incur the penalty for unaligned access). This shows a small improvement on older processors which have a significant penalty for unaligned reads. postproc_mmx.c is unused Change-Id: I87b29bbc0c3b19ee1ca1de3c4f47332a53087b3d --- vp8/common/x86/loopfilter_sse2.asm | 44 +- vp8/common/x86/postproc_mmx.c | 1508 ---------------------------- 2 files changed, 22 insertions(+), 1530 deletions(-) delete mode 100644 vp8/common/x86/postproc_mmx.c diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 4efff7eb5..295609c58 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -1395,8 +1395,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): neg rax ; calculate mask - movdqu xmm1, [rsi+2*rax] ; p1 - movdqu xmm0, [rdi] ; q1 + movdqa xmm1, [rsi+2*rax] ; p1 + movdqa xmm0, [rdi] ; q1 movdqa xmm2, xmm1 movdqa xmm7, xmm0 movdqa xmm4, xmm0 @@ -1406,8 +1406,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero psrlw xmm1, 1 ; abs(p1-q1)/2 - movdqu xmm5, [rsi+rax] ; p0 - movdqu xmm4, [rsi] ; q0 + movdqa xmm5, [rsi+rax] ; p0 + movdqa xmm4, [rsi] ; q0 movdqa xmm0, xmm4 ; q0 movdqa xmm6, xmm5 ; p0 psubusb xmm5, xmm4 ; p0-=q0 @@ -1449,7 +1449,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): psubsb xmm3, xmm0 ; q0-= q0 add pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqu [rsi], xmm3 ; write back + movdqa [rsi], xmm3 ; write back ; now do +3 side psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 @@ -1465,7 +1465,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): paddsb xmm6, xmm0 ; p0+= p0 add pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqu [rsi+rax], xmm6 ; write back + movdqa [rsi+rax], xmm6 ; write back ; begin epilog pop rdi @@ -1507,17 +1507,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): lea rdx, [rsi + rax*4] lea rcx, [rdx + rax] - movdqu xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movdqu xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movdqu xmm2, [rdi] ; 13 12 11 10 - movdqu xmm3, [rcx] ; 53 52 51 50 + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - movdqu xmm4, [rsi + rax*2] ; 23 22 21 20 - movdqu xmm5, [rdx + rax*2] ; 63 62 61 60 - movdqu xmm6, [rdi + rax*2] ; 33 32 31 30 - movdqu xmm7, [rcx + rax*2] ; 73 72 71 70 + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 @@ -1540,17 +1540,17 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): lea rdx, [rsi + rax*4] lea rcx, [rdx + rax] - movdqu xmm4, [rsi] ; 83 82 81 80 - movdqu xmm1, [rdx] ; c3 c2 c1 c0 - movdqu xmm6, [rdi] ; 93 92 91 90 - movdqu xmm3, [rcx] ; d3 d2 d1 d0 + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - movdqu xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movdqu xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movdqu xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movdqu xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c deleted file mode 100644 index 6b6321ace..000000000 --- a/vp8/common/x86/postproc_mmx.c +++ /dev/null @@ -1,1508 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include -#include "vpx_scale/yv12config.h" -#include "pragmas.h" - -#define VP8_FILTER_WEIGHT 128 -#define VP8_FILTER_SHIFT 7 - - - -/* static constants */ -__declspec(align(16)) -const static short Blur[48] = -{ - - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 64, 64, 64, 64, 64, 64, 64, 64, - 16, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 16, 16, 16, 16, 16, 16, - 0, 0, 0, 0, 0, 0, 0, 0, - -}; -#define RD __declspec(align(16)) __int64 rd = 0x0040004000400040; -#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004}; - -#ifndef RELOCATEABLE -const static RD; -const static R4D2; -#endif - - -/* external references */ -extern double vp8_gaussian(double sigma, double mu, double x); -extern short vp8_rv[]; -extern int vp8_q2mbl(int x) ; - - - -void vp8_post_proc_down_and_across_mmx -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -) -{ -#ifdef RELOCATEABLE - RD - R4D2 -#endif - - __asm - { - push ebx - lea ebx, Blur - movd mm2, flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov esi, src_ptr - mov edi, dst_ptr - - mov ecx, DWORD PTR rows - mov eax, src_pixels_per_line ; - destination pitch? - pxor mm0, mm0 ; - mm0 = 00000000 - - nextrow: - - xor edx, edx ; - - clear out edx for use as loop counter - nextcol: - - pxor mm7, mm7 ; - - mm7 = 00000000 - movq mm6, [ebx + 32 ] ; - mm6 = kernel 2 taps - movq mm3, [esi] ; - mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; - mm3 = p0..p3 - movq mm1, mm3 ; - mm1 = p0..p3 - pmullw mm3, mm6 ; - mm3 *= kernel 2 modifiers - - movq mm6, [ebx + 48] ; - mm6 = kernel 3 taps - movq mm5, [esi + eax] ; - mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r1 p0..p3 - pmullw mm6, mm5 ; - mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; - mm3 += mm6 - - ; - thresholding - movq mm7, mm1 ; - mm7 = r0 p0..p3 - psubusw mm7, mm5 ; - mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; - mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; - mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [ebx + 64 ] ; - mm6 = kernel 4 modifiers - movq mm5, [esi + 2*eax] ; - mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r2 p0..p3 - pmullw mm6, mm5 ; - mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm5 ; - mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; - mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - - neg eax - movq mm6, [ebx ] ; - kernel 0 taps - movq mm5, [esi+2*eax] ; - mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; - mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; - mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; - mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - movq mm6, [ebx + 16] ; - kernel 1 taps - movq mm4, [esi+eax] ; - mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; - mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; - mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = r0 p0..p3 - psubusw mm6, mm4 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; - mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; - mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - - paddusw mm3, rd ; - mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; - mm3 /= 128 - - pand mm1, mm7 ; - mm1 select vals > thresh from source - pandn mm7, mm3 ; - mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; - combination - - packuswb mm1, mm0 ; - pack to bytes - - movd [edi], mm1 ; - neg eax ; - pitch is positive - - - add esi, 4 - add edi, 4 - add edx, 4 - - cmp edx, cols - jl nextcol - // done with the all cols, start the across filtering in place - sub esi, edx - sub edi, edx - - - push eax - xor edx, edx - mov eax, [edi-4]; - - acrossnextcol: - pxor mm7, mm7 ; - mm7 = 00000000 - movq mm6, [ebx + 32 ] ; - movq mm4, [edi+edx] ; - mm4 = p0..p7 - movq mm3, mm4 ; - mm3 = p0..p7 - punpcklbw mm3, mm0 ; - mm3 = p0..p3 - movq mm1, mm3 ; - mm1 = p0..p3 - pmullw mm3, mm6 ; - mm3 *= kernel 2 modifiers - - movq mm6, [ebx + 48] - psrlq mm4, 8 ; - mm4 = p1..p7 - movq mm5, mm4 ; - mm5 = p1..p7 - punpcklbw mm5, mm0 ; - mm5 = p1..p4 - pmullw mm6, mm5 ; - mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; - mm3 += mm6 - - ; - thresholding - movq mm7, mm1 ; - mm7 = p0..p3 - psubusw mm7, mm5 ; - mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; - mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [ebx + 64 ] - psrlq mm4, 8 ; - mm4 = p2..p7 - movq mm5, mm4 ; - mm5 = p2..p7 - punpcklbw mm5, mm0 ; - mm5 = p2..p5 - pmullw mm6, mm5 ; - mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - - movq mm6, [ebx ] - movq mm4, [edi+edx-2] ; - mm4 = p-2..p5 - movq mm5, mm4 ; - mm5 = p-2..p5 - punpcklbw mm5, mm0 ; - mm5 = p-2..p1 - pmullw mm6, mm5 ; - mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm5 ; - mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - movq mm6, [ebx + 16] - psrlq mm4, 8 ; - mm4 = p-1..p5 - punpcklbw mm4, mm0 ; - mm4 = p-1..p2 - pmullw mm6, mm4 ; - mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; - mm3 += mm5 - - ; - thresholding - movq mm6, mm1 ; - mm6 = p0..p3 - psubusw mm6, mm4 ; - mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; - mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; - accumulate thresholds - - paddusw mm3, rd ; - mm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; - mm3 /= 128 - - pand mm1, mm7 ; - mm1 select vals > thresh from source - pandn mm7, mm3 ; - mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; - combination - - packuswb mm1, mm0 ; - pack to bytes - mov DWORD PTR [edi+edx-4], eax ; - store previous four bytes - movd eax, mm1 - - add edx, 4 - cmp edx, cols - jl acrossnextcol; - - mov DWORD PTR [edi+edx-4], eax - pop eax - - // done with this rwo - add esi, eax ; - next line - mov eax, dst_pixels_per_line ; - destination pitch? - add edi, eax ; - next destination - mov eax, src_pixels_per_line ; - destination pitch? - - dec ecx ; - decrement count - jnz nextrow ; - next row - pop ebx - - } -} - - - -void vp8_post_proc_down_and_across_xmm -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -) -{ -#ifdef RELOCATEABLE - R4D2 -#endif - - __asm - { - movd xmm2, flimit - punpcklwd xmm2, xmm2 - punpckldq xmm2, xmm2 - punpcklqdq xmm2, xmm2 - - mov esi, src_ptr - mov edi, dst_ptr - - mov ecx, DWORD PTR rows - mov eax, src_pixels_per_line ; - destination pitch? - pxor xmm0, xmm0 ; - mm0 = 00000000 - - nextrow: - - xor edx, edx ; - - clear out edx for use as loop counter - nextcol: - movq xmm3, QWORD PTR [esi] ; - - mm4 = r0 p0..p7 - punpcklbw xmm3, xmm0 ; - mm3 = p0..p3 - movdqa xmm1, xmm3 ; - mm1 = p0..p3 - psllw xmm3, 2 ; - - movq xmm5, QWORD PTR [esi + eax] ; - mm4 = r1 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r1 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm6 - - ; - thresholding - movdqa xmm7, xmm1 ; - mm7 = r0 p0..p3 - psubusw xmm7, xmm5 ; - mm7 = r0 p0..p3 - r1 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r1 p0..p3 - r0 p0..p3 - paddusw xmm7, xmm5 ; - mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw xmm7, xmm2 - - movq xmm5, QWORD PTR [esi + 2*eax] ; - mm4 = r2 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r2 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; - mm6 = r0 p0..p3 - r2 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r2 p0..p3 - r2 p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - - neg eax - movq xmm5, QWORD PTR [esi+2*eax] ; - mm4 = r-2 p0..p7 - punpcklbw xmm5, xmm0 ; - mm5 = r-2 p0..p3 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm5, xmm1 ; - mm5 = r-2 p0..p3 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - movq xmm4, QWORD PTR [esi+eax] ; - mm4 = r-1 p0..p7 - punpcklbw xmm4, xmm0 ; - mm4 = r-1 p0..p3 - paddusw xmm3, xmm4 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = r0 p0..p3 - psubusw xmm6, xmm4 ; - mm6 = p0..p3 - r-2 p0..p3 - psubusw xmm4, xmm1 ; - mm5 = r-1 p0..p3 - p0..p3 - paddusw xmm6, xmm4 ; - mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - - paddusw xmm3, rd42 ; - mm3 += round value - psraw xmm3, 3 ; - mm3 /= 8 - - pand xmm1, xmm7 ; - mm1 select vals > thresh from source - pandn xmm7, xmm3 ; - mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; - combination - - packuswb xmm1, xmm0 ; - pack to bytes - movq QWORD PTR [edi], xmm1 ; - - neg eax ; - pitch is positive - add esi, 8 - add edi, 8 - - add edx, 8 - cmp edx, cols - - jl nextcol - - // done with the all cols, start the across filtering in place - sub esi, edx - sub edi, edx - - xor edx, edx - movq mm0, QWORD PTR [edi-8]; - - acrossnextcol: - movq xmm7, QWORD PTR [edi +edx -2] - movd xmm4, DWORD PTR [edi +edx +6] - - pslldq xmm4, 8 - por xmm4, xmm7 - - movdqa xmm3, xmm4 - psrldq xmm3, 2 - punpcklbw xmm3, xmm0 ; - mm3 = p0..p3 - movdqa xmm1, xmm3 ; - mm1 = p0..p3 - psllw xmm3, 2 - - - movdqa xmm5, xmm4 - psrldq xmm5, 3 - punpcklbw xmm5, xmm0 ; - mm5 = p1..p4 - paddusw xmm3, xmm5 ; - mm3 += mm6 - - ; - thresholding - movdqa xmm7, xmm1 ; - mm7 = p0..p3 - psubusw xmm7, xmm5 ; - mm7 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm7, xmm5 ; - mm7 = abs(p0..p3 - p1..p4) - pcmpgtw xmm7, xmm2 - - movdqa xmm5, xmm4 - psrldq xmm5, 4 - punpcklbw xmm5, xmm0 ; - mm5 = p2..p5 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - - movdqa xmm5, xmm4 ; - mm5 = p-2..p5 - punpcklbw xmm5, xmm0 ; - mm5 = p-2..p1 - paddusw xmm3, xmm5 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm5 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - psrldq xmm4, 1 ; - mm4 = p-1..p5 - punpcklbw xmm4, xmm0 ; - mm4 = p-1..p2 - paddusw xmm3, xmm4 ; - mm3 += mm5 - - ; - thresholding - movdqa xmm6, xmm1 ; - mm6 = p0..p3 - psubusw xmm6, xmm4 ; - mm6 = p0..p3 - p1..p4 - psubusw xmm4, xmm1 ; - mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm4 ; - mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; - accumulate thresholds - - paddusw xmm3, rd42 ; - mm3 += round value - psraw xmm3, 3 ; - mm3 /= 8 - - pand xmm1, xmm7 ; - mm1 select vals > thresh from source - pandn xmm7, xmm3 ; - mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; - combination - - packuswb xmm1, xmm0 ; - pack to bytes - movq QWORD PTR [edi+edx-8], mm0 ; - store previous four bytes - movdq2q mm0, xmm1 - - add edx, 8 - cmp edx, cols - jl acrossnextcol; - - // last 8 pixels - movq QWORD PTR [edi+edx-8], mm0 - - // done with this rwo - add esi, eax ; - next line - mov eax, dst_pixels_per_line ; - destination pitch? - add edi, eax ; - next destination - mov eax, src_pixels_per_line ; - destination pitch? - - dec ecx ; - decrement count - jnz nextrow ; - next row - } -} - - -void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit) -{ - int c, i; - __declspec(align(16)) - int flimit2[2]; - __declspec(align(16)) - unsigned char d[16][8]; - - flimit = vp8_q2mbl(flimit); - - for (i = 0; i < 2; i++) - flimit2[i] = flimit; - - rows += 8; - - for (c = 0; c < cols; c += 4) - { - unsigned char *s = &dst[c]; - - __asm - { - mov esi, s ; - pxor mm0, mm0 ; - - mov eax, pitch ; - neg eax // eax = -pitch - - lea esi, [esi + eax*8]; // edi = s[-pitch*8] - neg eax - - - pxor mm5, mm5 - pxor mm6, mm6 ; - - pxor mm7, mm7 ; - mov edi, esi - - mov ecx, 15 ; - - loop_initvar: - movd mm1, DWORD PTR [edi]; - punpcklbw mm1, mm0 ; - - paddw mm5, mm1 ; - pmullw mm1, mm1 ; - - movq mm2, mm1 ; - punpcklwd mm1, mm0 ; - - punpckhwd mm2, mm0 ; - paddd mm6, mm1 ; - - paddd mm7, mm2 ; - lea edi, [edi+eax] ; - - dec ecx - jne loop_initvar - //save the var and sum - xor edx, edx - loop_row: - movd mm1, DWORD PTR [esi] // [s-pitch*8] - movd mm2, DWORD PTR [edi] // [s+pitch*7] - - punpcklbw mm1, mm0 - punpcklbw mm2, mm0 - - paddw mm5, mm2 - psubw mm5, mm1 - - pmullw mm2, mm2 - movq mm4, mm2 - - punpcklwd mm2, mm0 - punpckhwd mm4, mm0 - - paddd mm6, mm2 - paddd mm7, mm4 - - pmullw mm1, mm1 - movq mm2, mm1 - - punpcklwd mm1, mm0 - psubd mm6, mm1 - - punpckhwd mm2, mm0 - psubd mm7, mm2 - - - movq mm3, mm6 - pslld mm3, 4 - - psubd mm3, mm6 - movq mm1, mm5 - - movq mm4, mm5 - pmullw mm1, mm1 - - pmulhw mm4, mm4 - movq mm2, mm1 - - punpcklwd mm1, mm4 - punpckhwd mm2, mm4 - - movq mm4, mm7 - pslld mm4, 4 - - psubd mm4, mm7 - - psubd mm3, mm1 - psubd mm4, mm2 - - psubd mm3, flimit2 - psubd mm4, flimit2 - - psrad mm3, 31 - psrad mm4, 31 - - packssdw mm3, mm4 - packsswb mm3, mm0 - - movd mm1, DWORD PTR [esi+eax*8] - - movq mm2, mm1 - punpcklbw mm1, mm0 - - paddw mm1, mm5 - mov ecx, edx - - and ecx, 127 - movq mm4, vp8_rv[ecx*2] - - paddw mm1, mm4 - //paddw xmm1, eight8s - psraw mm1, 4 - - packuswb mm1, mm0 - pand mm1, mm3 - - pandn mm3, mm2 - por mm1, mm3 - - and ecx, 15 - movd DWORD PTR d[ecx*4], mm1 - - mov ecx, edx - sub ecx, 8 - - and ecx, 15 - movd mm1, DWORD PTR d[ecx*4] - - movd [esi], mm1 - lea esi, [esi+eax] - - lea edi, [edi+eax] - add edx, 1 - - cmp edx, rows - jl loop_row - - } - - } -} - -void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit) -{ - int c, i; - __declspec(align(16)) - int flimit4[4]; - __declspec(align(16)) - unsigned char d[16][8]; - - flimit = vp8_q2mbl(flimit); - - for (i = 0; i < 4; i++) - flimit4[i] = flimit; - - rows += 8; - - for (c = 0; c < cols; c += 8) - { - unsigned char *s = &dst[c]; - - __asm - { - mov esi, s ; - pxor xmm0, xmm0 ; - - mov eax, pitch ; - neg eax // eax = -pitch - - lea esi, [esi + eax*8]; // edi = s[-pitch*8] - neg eax - - - pxor xmm5, xmm5 - pxor xmm6, xmm6 ; - - pxor xmm7, xmm7 ; - mov edi, esi - - mov ecx, 15 ; - - loop_initvar: - movq xmm1, QWORD PTR [edi]; - punpcklbw xmm1, xmm0 ; - - paddw xmm5, xmm1 ; - pmullw xmm1, xmm1 ; - - movdqa xmm2, xmm1 ; - punpcklwd xmm1, xmm0 ; - - punpckhwd xmm2, xmm0 ; - paddd xmm6, xmm1 ; - - paddd xmm7, xmm2 ; - lea edi, [edi+eax] ; - - dec ecx - jne loop_initvar - //save the var and sum - xor edx, edx - loop_row: - movq xmm1, QWORD PTR [esi] // [s-pitch*8] - movq xmm2, QWORD PTR [edi] // [s+pitch*7] - - punpcklbw xmm1, xmm0 - punpcklbw xmm2, xmm0 - - paddw xmm5, xmm2 - psubw xmm5, xmm1 - - pmullw xmm2, xmm2 - movdqa xmm4, xmm2 - - punpcklwd xmm2, xmm0 - punpckhwd xmm4, xmm0 - - paddd xmm6, xmm2 - paddd xmm7, xmm4 - - pmullw xmm1, xmm1 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm0 - psubd xmm6, xmm1 - - punpckhwd xmm2, xmm0 - psubd xmm7, xmm2 - - - movdqa xmm3, xmm6 - pslld xmm3, 4 - - psubd xmm3, xmm6 - movdqa xmm1, xmm5 - - movdqa xmm4, xmm5 - pmullw xmm1, xmm1 - - pmulhw xmm4, xmm4 - movdqa xmm2, xmm1 - - punpcklwd xmm1, xmm4 - punpckhwd xmm2, xmm4 - - movdqa xmm4, xmm7 - pslld xmm4, 4 - - psubd xmm4, xmm7 - - psubd xmm3, xmm1 - psubd xmm4, xmm2 - - psubd xmm3, flimit4 - psubd xmm4, flimit4 - - psrad xmm3, 31 - psrad xmm4, 31 - - packssdw xmm3, xmm4 - packsswb xmm3, xmm0 - - movq xmm1, QWORD PTR [esi+eax*8] - - movq xmm2, xmm1 - punpcklbw xmm1, xmm0 - - paddw xmm1, xmm5 - mov ecx, edx - - and ecx, 127 - movdqu xmm4, vp8_rv[ecx*2] - - paddw xmm1, xmm4 - //paddw xmm1, eight8s - psraw xmm1, 4 - - packuswb xmm1, xmm0 - pand xmm1, xmm3 - - pandn xmm3, xmm2 - por xmm1, xmm3 - - and ecx, 15 - movq QWORD PTR d[ecx*8], xmm1 - - mov ecx, edx - sub ecx, 8 - - and ecx, 15 - movq mm0, d[ecx*8] - - movq [esi], mm0 - lea esi, [esi+eax] - - lea edi, [edi+eax] - add edx, 1 - - cmp edx, rows - jl loop_row - - } - - } -} -#if 0 -/**************************************************************************** - * - * ROUTINE : plane_add_noise_wmt - * - * INPUTS : unsigned char *Start starting address of buffer to add gaussian - * noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - char char_dist[300]; - char Rand[2048]; - double sigma; -// return; - __asm emms - sigma = a + .5 + .6 * (63 - q) / 63.0; - - // set up a lookup table of 256 entries that matches - // a gaussian distribution with sigma determined by q. - // - { - double i; - int next, j; - - next = 0; - - for (i = -32; i < 32; i++) - { - double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i); - int a = (int)(g + .5); - - if (a) - { - for (j = 0; j < a; j++) - { - char_dist[next+j] = (char) i; - } - - next = next + j; - } - - } - - for (next = next; next < 256; next++) - char_dist[next] = 0; - - } - - for (i = 0; i < 2048; i++) - { - Rand[i] = char_dist[rand() & 0xff]; - } - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -char_dist[0]; - whiteclamp[i] = -char_dist[0]; - bothclamp[i] = -2 * char_dist[0]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = Rand + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movdqu xmm1, [esi+eax] // get the source - - psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb xmm1, bothclamp - psubusb xmm1, whiteclamp - - movdqu xmm2, [edi+eax] // get the noise for this line - paddb xmm1, xmm2 // add it in - movdqu [esi+eax], xmm1 // store the result - - add eax, 16 // move to the next line - - cmp eax, ecx - jl nextset - - - } - - } -} -#endif -__declspec(align(16)) -static const int four8s[4] = { 8, 8, 8, 8}; -void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit) -{ - int r, i; - __declspec(align(16)) - int flimit4[4]; - unsigned char *s = src; - int sumsq; - int sum; - - - flimit = vp8_q2mbl(flimit); - flimit4[0] = - flimit4[1] = - flimit4[2] = - flimit4[3] = flimit; - - for (r = 0; r < rows; r++) - { - - - sumsq = 0; - sum = 0; - - for (i = -8; i <= 6; i++) - { - sumsq += s[i] * s[i]; - sum += s[i]; - } - - __asm - { - mov eax, sumsq - movd xmm7, eax - - mov eax, sum - movd xmm6, eax - - mov esi, s - xor ecx, ecx - - mov edx, cols - add edx, 8 - pxor mm0, mm0 - pxor mm1, mm1 - - pxor xmm0, xmm0 - nextcol4: - - movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5 - movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10 - - punpcklbw xmm1, xmm0 // expanding - punpcklbw xmm2, xmm0 // expanding - - punpcklwd xmm1, xmm0 // expanding to dwords - punpcklwd xmm2, xmm0 // expanding to dwords - - psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5 - paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2 - - paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5 - pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5 - - paddd xmm6, xmm2 - paddd xmm7, xmm1 - - pshufd xmm6, xmm6, 0 // duplicate the last ones - pshufd xmm7, xmm7, 0 // duplicate the last ones - - psrldq xmm1, 4 // 8--7 9--6 10--5 0000 - psrldq xmm2, 4 // 8--7 9--6 10--5 0000 - - pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared - pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared - - paddd xmm6, xmm4 - paddd xmm7, xmm3 - - pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared - pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared - pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared - - paddd xmm7, xmm3 - paddd xmm6, xmm4 - - movdqa xmm3, xmm6 - pmaddwd xmm3, xmm3 - - movdqa xmm5, xmm7 - pslld xmm5, 4 - - psubd xmm5, xmm7 - psubd xmm5, xmm3 - - psubd xmm5, flimit4 - psrad xmm5, 31 - - packssdw xmm5, xmm0 - packsswb xmm5, xmm0 - - movd xmm1, DWORD PTR [esi+ecx] - movq xmm2, xmm1 - - punpcklbw xmm1, xmm0 - punpcklwd xmm1, xmm0 - - paddd xmm1, xmm6 - paddd xmm1, four8s - - psrad xmm1, 4 - packssdw xmm1, xmm0 - - packuswb xmm1, xmm0 - pand xmm1, xmm5 - - pandn xmm5, xmm2 - por xmm5, xmm1 - - movd [esi+ecx-8], mm0 - movq mm0, mm1 - - movdq2q mm1, xmm5 - psrldq xmm7, 12 - - psrldq xmm6, 12 - add ecx, 4 - - cmp ecx, edx - jl nextcol4 - - } - s += pitch; - } -} - -#if 0 - -/**************************************************************************** - * - * ROUTINE : plane_add_noise_mmx - * - * INPUTS : unsigned char *Start starting address of buffer to add gaussian - * noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - int Pitch4 = Pitch * 4; - const int noise_amount = 2; - const int noise_adder = 2 * noise_amount + 1; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - char char_dist[300]; - char Rand[2048]; - - double sigma; - __asm emms - sigma = a + .5 + .6 * (63 - q) / 63.0; - - // set up a lookup table of 256 entries that matches - // a gaussian distribution with sigma determined by q. - // - { - double i, sum = 0; - int next, j; - - next = 0; - - for (i = -32; i < 32; i++) - { - int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i)); - - if (a) - { - for (j = 0; j < a; j++) - { - char_dist[next+j] = (char) i; - } - - next = next + j; - } - - } - - for (next = next; next < 256; next++) - char_dist[next] = 0; - - } - - for (i = 0; i < 2048; i++) - { - Rand[i] = char_dist[rand() & 0xff]; - } - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -char_dist[0]; - whiteclamp[i] = -char_dist[0]; - bothclamp[i] = -2 * char_dist[0]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = Rand + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movq mm1, [esi+eax] // get the source - - psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb mm1, bothclamp - psubusb mm1, whiteclamp - - movq mm2, [edi+eax] // get the noise for this line - paddb mm1, mm2 // add it in - movq [esi+eax], mm1 // store the result - - add eax, 8 // move to the next line - - cmp eax, ecx - jl nextset - - - } - - } -} -#else -extern char an[8][64][3072]; -extern int cd[8][64]; - -void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - - __asm emms - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -cd[a][q]; - whiteclamp[i] = -cd[a][q]; - bothclamp[i] = -2 * cd[a][q]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = an[a][q] + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movq mm1, [esi+eax] // get the source - - psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb mm1, bothclamp - psubusb mm1, whiteclamp - - movq mm2, [edi+eax] // get the noise for this line - paddb mm1, mm2 // add it in - movq [esi+eax], mm1 // store the result - - add eax, 8 // move to the next line - - cmp eax, ecx - jl nextset - } - } -} - - -void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a) -{ - unsigned int i; - - __declspec(align(16)) unsigned char blackclamp[16]; - __declspec(align(16)) unsigned char whiteclamp[16]; - __declspec(align(16)) unsigned char bothclamp[16]; - - __asm emms - - for (i = 0; i < 16; i++) - { - blackclamp[i] = -cd[a][q]; - whiteclamp[i] = -cd[a][q]; - bothclamp[i] = -2 * cd[a][q]; - } - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = an[a][q] + (rand() & 0xff); - - __asm - { - mov ecx, [Width] - mov esi, Pos - mov edi, Ref - xor eax, eax - - nextset: - movdqu xmm1, [esi+eax] // get the source - - psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise - paddusb xmm1, bothclamp - psubusb xmm1, whiteclamp - - movdqu xmm2, [edi+eax] // get the noise for this line - paddb xmm1, xmm2 // add it in - movdqu [esi+eax], xmm1 // store the result - - add eax, 16 // move to the next line - - cmp eax, ecx - jl nextset - } - } -} - -#endif -- 2.40.0