mova [r2+r4+mmsize], m4
paddw m1, s30
paddw m4, s30
- add r4, 2*mmsize
FILT_PACK m1, m4, 5, m6, w, s10
CLIPW m1, m0, m7
CLIPW m4, m0, m7
- mova [r0+r4-mmsize*2], m1
- mova [r0+r4-mmsize*1], m4
+ mova [r0+r4], m1
+ mova [r0+r4+mmsize], m4
+ add r4, 2*mmsize ; pointer advance moved next to jl so the pair can macro-op fuse
jl .loop
REP_RET
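; Second HPEL_FILTER loop: same reordering as above. The pointer advance
; moves from the top of the loop to just before jl, and the store offsets
; are rebased to compensate, presumably so the flag-setting add and the
; branch sit adjacent for macro-op fusion.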
FILT_H2 m1, m2, m3, m4, m5, m6
mova m7, [pw_1]
pxor m2, m2
- add r2, mmsize*2
FILT_PACK m1, m4, 1, m7, w
CLIPW m1, m2, m0
CLIPW m4, m2, m0
- mova [r0+r2-mmsize*2], m1
- mova [r0+r2-mmsize*1], m4
+ mova [r0+r2], m1
+ mova [r0+r2+mmsize], m4
+ add r2, mmsize*2
jl .loop
REP_RET
%endmacro ; HPEL_FILTER
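; memcpy_aligned_mmx counts r2 down to zero, copying 16 bytes in the head
; case and 32 bytes per main-loop iteration. The sub now follows the
; loads/stores (with offsets rebased to negative) so that sub+jg are
; adjacent, which allows macro-op fusion on CPUs that support it.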
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
jz .copy32start
- sub r2d, 16
- movq mm0, [r1 + r2 + 0]
- movq mm1, [r1 + r2 + 8]
- movq [r0 + r2 + 0], mm0
- movq [r0 + r2 + 8], mm1
+ movq mm0, [r1 + r2 - 16]
+ movq mm1, [r1 + r2 - 8]
+ movq [r0 + r2 - 16], mm0
+ movq [r0 + r2 - 8], mm1
+ sub r2d, 16
.copy32start:
test r2d, r2d
jz .ret
.copy32:
- sub r2d, 32
- movq mm0, [r1 + r2 + 0]
- movq mm1, [r1 + r2 + 8]
- movq mm2, [r1 + r2 + 16]
- movq mm3, [r1 + r2 + 24]
- movq [r0 + r2 + 0], mm0
- movq [r0 + r2 + 8], mm1
- movq [r0 + r2 + 16], mm2
- movq [r0 + r2 + 24], mm3
+ movq mm0, [r1 + r2 - 32]
+ movq mm1, [r1 + r2 - 24]
+ movq mm2, [r1 + r2 - 16]
+ movq mm3, [r1 + r2 - 8]
+ movq [r0 + r2 - 32], mm0
+ movq [r0 + r2 - 24], mm1
+ movq [r0 + r2 - 16], mm2
+ movq [r0 + r2 - 8], mm3
+ sub r2d, 32
jg .copy32
.ret:
REP_RET
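; memcpy_aligned_sse2 gets the identical treatment: the 16- and 32-byte
; head cases and the 64-byte main loop each move their sub below the
; copies, compensating with negative offsets.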
cglobal memcpy_aligned_sse2, 3,3
test r2d, 16
jz .copy32
- sub r2d, 16
- movdqa xmm0, [r1 + r2]
- movdqa [r0 + r2], xmm0
+ movdqa xmm0, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm0
+ sub r2d, 16
.copy32:
test r2d, 32
jz .copy64start
- sub r2d, 32
- movdqa xmm0, [r1 + r2 + 0]
- movdqa [r0 + r2 + 0], xmm0
- movdqa xmm1, [r1 + r2 + 16]
- movdqa [r0 + r2 + 16], xmm1
+ movdqa xmm0, [r1 + r2 - 32]
+ movdqa [r0 + r2 - 32], xmm0
+ movdqa xmm1, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm1
+ sub r2d, 32
.copy64start:
test r2d, r2d
jz .ret
.copy64:
- sub r2d, 64
- movdqa xmm0, [r1 + r2 + 0]
- movdqa [r0 + r2 + 0], xmm0
- movdqa xmm1, [r1 + r2 + 16]
- movdqa [r0 + r2 + 16], xmm1
- movdqa xmm2, [r1 + r2 + 32]
- movdqa [r0 + r2 + 32], xmm2
- movdqa xmm3, [r1 + r2 + 48]
- movdqa [r0 + r2 + 48], xmm3
+ movdqa xmm0, [r1 + r2 - 64]
+ movdqa [r0 + r2 - 64], xmm0
+ movdqa xmm1, [r1 + r2 - 48]
+ movdqa [r0 + r2 - 48], xmm1
+ movdqa xmm2, [r1 + r2 - 32]
+ movdqa [r0 + r2 - 32], xmm2
+ movdqa xmm3, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm3
+ sub r2d, 64
jg .copy64
.ret:
REP_RET
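; Word-difference loop (8 bytes per iteration): the r2 decrement moves to
; the bottom so sub+jge are adjacent; the loads and stores inside the loop
; take -8 offsets to compensate.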
mova m0, [r0+r2]
mova m4, [r4+r2]
.loop:
- sub r2, 8
mova m1, m4
psubw m1, m0
- mova m4, [r4+r2]
- mova m0, [r0+r2]
+ mova m4, [r4+r2-8]
+ mova m0, [r0+r2-8]
paddw m1, m4
- mova m3, [r3+r2]
+ mova m3, [r3+r2-8]
psubw m1, m0
psubw m3, m0
- mova [r0+r2], m1
- mova [r1+r2], m3
+ mova [r0+r2-8], m1
+ mova [r1+r2-8], m3
+ sub r2, 8
jge .loop
REP_RET
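; denoise_dct, dword variant: take the absolute value of each coefficient,
; subtract the offset (clamping negative results to zero via pcmpgtd/pand),
; reapply the original sign, and accumulate the absolute values into the
; sum array. The counter update again moves next to jg.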
cglobal denoise_dct, 4,4,8
pxor m6, m6
.loop:
- sub r3, mmsize/2
- mova m2, [r0+r3*4+0*mmsize]
- mova m3, [r0+r3*4+1*mmsize]
+ mova m2, [r0+r3*4-2*mmsize]
+ mova m3, [r0+r3*4-1*mmsize]
ABSD m0, m2
ABSD m1, m3
mova m4, m0
mova m5, m1
- psubd m0, [r2+r3*4+0*mmsize]
- psubd m1, [r2+r3*4+1*mmsize]
+ psubd m0, [r2+r3*4-2*mmsize]
+ psubd m1, [r2+r3*4-1*mmsize]
pcmpgtd m7, m0, m6
pand m0, m7
pcmpgtd m7, m1, m6
pand m1, m7
PSIGND m0, m2
PSIGND m1, m3
- mova [r0+r3*4+0*mmsize], m0
- mova [r0+r3*4+1*mmsize], m1
- paddd m4, [r1+r3*4+0*mmsize]
- paddd m5, [r1+r3*4+1*mmsize]
- mova [r1+r3*4+0*mmsize], m4
- mova [r1+r3*4+1*mmsize], m5
+ mova [r0+r3*4-2*mmsize], m0
+ mova [r0+r3*4-1*mmsize], m1
+ paddd m4, [r1+r3*4-2*mmsize]
+ paddd m5, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-2*mmsize], m4
+ mova [r1+r3*4-1*mmsize], m5
+ sub r3, mmsize/2
jg .loop
REP_RET
%endmacro
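; denoise_dct, word variant: psubusw performs the offset subtraction with
; unsigned saturation in one step; the absolute values are then widened to
; dwords (punpcklwd/punpckhwd against zero) before being added to the sums,
; hence r3*4 addressing for the sum array versus r3*2 for the coefficients.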
cglobal denoise_dct, 4,4,7
pxor m6, m6
.loop:
- sub r3, mmsize
- mova m2, [r0+r3*2+0*mmsize]
- mova m3, [r0+r3*2+1*mmsize]
+ mova m2, [r0+r3*2-2*mmsize]
+ mova m3, [r0+r3*2-1*mmsize]
ABSW m0, m2, sign
ABSW m1, m3, sign
- psubusw m4, m0, [r2+r3*2+0*mmsize]
- psubusw m5, m1, [r2+r3*2+1*mmsize]
+ psubusw m4, m0, [r2+r3*2-2*mmsize]
+ psubusw m5, m1, [r2+r3*2-1*mmsize]
PSIGNW m4, m2
PSIGNW m5, m3
- mova [r0+r3*2+0*mmsize], m4
- mova [r0+r3*2+1*mmsize], m5
+ mova [r0+r3*2-2*mmsize], m4
+ mova [r0+r3*2-1*mmsize], m5
punpcklwd m2, m0, m6
punpcklwd m3, m1, m6
punpckhwd m0, m6
punpckhwd m1, m6
- paddd m2, [r1+r3*4+0*mmsize]
- paddd m0, [r1+r3*4+1*mmsize]
- paddd m3, [r1+r3*4+2*mmsize]
- paddd m1, [r1+r3*4+3*mmsize]
- mova [r1+r3*4+0*mmsize], m2
- mova [r1+r3*4+1*mmsize], m0
- mova [r1+r3*4+2*mmsize], m3
- mova [r1+r3*4+3*mmsize], m1
+ paddd m2, [r1+r3*4-4*mmsize]
+ paddd m0, [r1+r3*4-3*mmsize]
+ paddd m3, [r1+r3*4-2*mmsize]
+ paddd m1, [r1+r3*4-1*mmsize]
+ mova [r1+r3*4-4*mmsize], m2
+ mova [r1+r3*4-3*mmsize], m0
+ mova [r1+r3*4-2*mmsize], m3
+ mova [r1+r3*4-1*mmsize], m1
+ sub r3, mmsize
jg .loop
REP_RET
%endmacro
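; Scan-tail logic: or is replaced with add where both set ZF identically
; for the values involved, presumably because add can macro-op fuse with
; the following jcc on newer Intel cores while or cannot.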
or r1, r2
xor r1, -1
je .ret
- or eax, r3d
+ add eax, r3d
jne .ret9
.loop:
bsf rcx, r1
je .tryret
xor r4, -1
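; Same or->add substitution in what appears to be the 32-bit path; as the
; comment below notes, a fall-through leaves r0 zero, so it already holds
; the return value.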
.cont:
- or r0, r2
+ add r0, r2
jne .ret9 ; if we fall through, r0 is zero, so the return value needs no explicit clearing
.loop:
bsf ecx, r3