;=============================================================================
%macro SSD_LOAD_FULL 5
- mova m1, [r0+%1]
- mova m2, [r2+%2]
- mova m3, [r0+%3]
- mova m4, [r2+%4]
-%if %5
- lea r0, [r0+2*r1]
- lea r2, [r2+2*r3]
+ mova m1, [t0+%1]
+ mova m2, [t2+%2]
+ mova m3, [t0+%3]
+ mova m4, [t2+%4]
+%if %5==1
+ add t0, t1
+ add t2, t3
+%elif %5==2
+ lea t0, [t0+2*t1]
+ lea t2, [t2+2*t3]
%endif
%endmacro
movh m%1, %3
movh m%2, %4
%if %5
- lea r0, [r0+2*r1]
+ lea t0, [t0+2*t1]
%endif
%endmacro
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m7
punpcklbw m%3, m7
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklqdq m%1, m%2
punpcklqdq m%3, m%4
movh m%3, %5
movh m%4, %6
%if %7
- lea r2, [r2+2*r3]
+ lea t2, [t2+2*t3]
%endif
punpcklbw m%1, m%3
punpcklbw m%2, m%4
%endmacro
%macro SSD_LOAD_HALF 5
- LOAD 1, 2, [r0+%1], [r0+%3], 1
- JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
- LOAD 3, 4, [r0+%1], [r0+%3], %5
- JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
+ LOAD 1, 2, [t0+%1], [t0+%3], 1
+ JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1
+ LOAD 3, 4, [t0+%1], [t0+%3], %5
+ JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5
%endmacro
%macro SSD_CORE 7-8
mova m%2, m%1
mova m%4, m%3
punpckhbw m%1, m%5
- punpckhbw m%3, m%5
punpcklbw m%2, m%5
+ punpckhbw m%3, m%5
punpcklbw m%4, m%5
%endif
pmaddwd m%1, m%1
DEINTB %6, %1, %7, %2, %5
psubw m%6, m%7
psubw m%1, m%2
- SWAP %2, %6
+ SWAP %6, %2, %1
DEINTB %6, %3, %7, %4, %5
psubw m%6, m%7
psubw m%3, m%4
- SWAP %4, %6
+ SWAP %6, %4, %3
%endif
pmaddwd m%1, m%1
pmaddwd m%2, m%2
punpcklbw m%3, m%4
punpckhbw m%6, m%2
punpckhbw m%7, m%4
- SWAP %6, %2
+ SWAP %6, %2, %3
SWAP %7, %4
%endif
pmaddubsw m%1, m%5
pmaddwd m%4, m%4
%endmacro
-%macro SSD_END 1
+%macro SSD_ITER 6
+ SSD_LOAD_%1 %2,%3,%4,%5,%6
+ SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
paddd m1, m2
paddd m3, m4
-%if %1
paddd m0, m1
-%else
- SWAP 0, 1
-%endif
paddd m0, m3
%endmacro
-%macro SSD_ITER 7
- SSD_LOAD_%1 %2,%3,%4,%5,%7
- SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
- SSD_END %6
-%endmacro
-
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
-cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
+%if %1 != %2
+ %assign function_align 8
+%else
+ %assign function_align 16
+%endif
+cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0
+ mov al, %1*%2/mmsize/2
+
+%if %1 != %2
+ jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop)
+%else
+
+.startloop:
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3
+%else
+ PROLOGUE 0,5
+ DECLARE_REG_TMP 1,2,3,4
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+%endif
+
%ifidn %3, ssse3
mova m7, [hsub_mul GLOBAL]
%elifidn %3, sse2
%elif %1 >= mmsize
pxor m7, m7
%endif
-%assign i 0
-%rep %2/4
+ pxor m0, m0
+
+ALIGN 16
+.loop:
%if %1 > mmsize
- SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
- SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
- SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, mmsize, mmsize, 1
%elif %1 == mmsize
- SSD_ITER FULL, 0, 0, r1, r3, i, 1
- SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
+ SSD_ITER FULL, 0, 0, t1, t3, 2
%else
- SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
+ SSD_ITER HALF, 0, 0, t1, t3, 2
%endif
-%assign i i+1
-%endrep
+ dec al
+ jg .loop
HADDD m0, m1
movd eax, m0
RET
+%endif
%endmacro
INIT_MMX
SSD 16, 16, mmx
SSD 16, 8, mmx
-SSD 8, 16, mmx
SSD 8, 8, mmx
+SSD 8, 16, mmx
+SSD 4, 4, mmx
SSD 8, 4, mmx
SSD 4, 8, mmx
-SSD 4, 4, mmx
INIT_XMM
SSD 16, 16, sse2slow, 8
+SSD 8, 8, sse2slow, 8
SSD 16, 8, sse2slow, 8
SSD 8, 16, sse2slow, 8
-SSD 8, 8, sse2slow, 8
SSD 8, 4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
SSD 16, 16, sse2, 8
+SSD 8, 8, sse2, 8
SSD 16, 8, sse2, 8
SSD 8, 16, sse2, 8
-SSD 8, 8, sse2, 8
SSD 8, 4, sse2, 8
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
SSD 16, 16, ssse3, 8
+SSD 8, 8, ssse3, 8
SSD 16, 8, ssse3, 8
SSD 8, 16, ssse3, 8
-SSD 8, 8, ssse3, 8
SSD 8, 4, ssse3, 8
INIT_MMX
-SSD 4, 8, ssse3
SSD 4, 4, ssse3
+SSD 4, 8, ssse3
+%assign function_align 16
;=============================================================================
; variance