From 99cf5bf62a738b05c7168f04e344d6d596c874d3 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Fri, 30 Oct 2009 18:58:03 -0700 Subject: [PATCH] Dramatically reduce size of pixel_ssd_* asm functions ~10k of code size eliminated. --- common/x86/pixel-a.asm | 114 ++++++++++++++++++++++++----------------- common/x86/x86inc.asm | 14 ++--- 2 files changed, 74 insertions(+), 54 deletions(-) diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 6a235c3d..a650b84e 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -77,13 +77,16 @@ SECTION .text ;============================================================================= %macro SSD_LOAD_FULL 5 - mova m1, [r0+%1] - mova m2, [r2+%2] - mova m3, [r0+%3] - mova m4, [r2+%4] -%if %5 - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] + mova m1, [t0+%1] + mova m2, [t2+%2] + mova m3, [t0+%3] + mova m4, [t2+%4] +%if %5==1 + add t0, t1 + add t2, t3 +%elif %5==2 + lea t0, [t0+2*t1] + lea t2, [t2+2*t3] %endif %endmacro @@ -91,7 +94,7 @@ SECTION .text movh m%1, %3 movh m%2, %4 %if %5 - lea r0, [r0+2*r1] + lea t0, [t0+2*t1] %endif %endmacro @@ -99,7 +102,7 @@ SECTION .text movh m%3, %5 movh m%4, %6 %if %7 - lea r2, [r2+2*r3] + lea t2, [t2+2*t3] %endif punpcklbw m%1, m7 punpcklbw m%3, m7 @@ -113,7 +116,7 @@ SECTION .text movh m%3, %5 movh m%4, %6 %if %7 - lea r2, [r2+2*r3] + lea t2, [t2+2*t3] %endif punpcklqdq m%1, m%2 punpcklqdq m%3, m%4 @@ -126,17 +129,17 @@ SECTION .text movh m%3, %5 movh m%4, %6 %if %7 - lea r2, [r2+2*r3] + lea t2, [t2+2*t3] %endif punpcklbw m%1, m%3 punpcklbw m%2, m%4 %endmacro %macro SSD_LOAD_HALF 5 - LOAD 1, 2, [r0+%1], [r0+%3], 1 - JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1 - LOAD 3, 4, [r0+%1], [r0+%3], %5 - JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5 + LOAD 1, 2, [t0+%1], [t0+%3], 1 + JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 + LOAD 3, 4, [t0+%1], [t0+%3], %5 + JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 %endmacro %macro SSD_CORE 7-8 @@ -152,8 +155,8 @@ SECTION .text mova m%2, m%1 mova m%4, m%3 punpckhbw m%1, m%5 - punpckhbw m%3, m%5 punpcklbw m%2, m%5 + punpckhbw m%3, m%5 punpcklbw m%4, m%5 %endif pmaddwd m%1, m%1 @@ -167,11 +170,11 @@ SECTION .text DEINTB %6, %1, %7, %2, %5 psubw m%6, m%7 psubw m%1, m%2 - SWAP %2, %6 + SWAP %6, %2, %1 DEINTB %6, %3, %7, %4, %5 psubw m%6, m%7 psubw m%3, m%4 - SWAP %4, %6 + SWAP %6, %4, %3 %endif pmaddwd m%1, m%1 pmaddwd m%2, m%2 @@ -187,7 +190,7 @@ SECTION .text punpcklbw m%3, m%4 punpckhbw m%6, m%2 punpckhbw m%7, m%4 - SWAP %6, %2 + SWAP %6, %2, %3 SWAP %7, %4 %endif pmaddubsw m%1, m%5 @@ -200,28 +203,43 @@ SECTION .text pmaddwd m%4, m%4 %endmacro -%macro SSD_END 1 +%macro SSD_ITER 6 + SSD_LOAD_%1 %2,%3,%4,%5,%6 + SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 paddd m1, m2 paddd m3, m4 -%if %1 paddd m0, m1 -%else - SWAP 0, 1 -%endif paddd m0, m3 %endmacro -%macro SSD_ITER 7 - SSD_LOAD_%1 %2,%3,%4,%5,%7 - SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 - SSD_END %6 -%endmacro - ;----------------------------------------------------------------------------- ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- %macro SSD 3-4 0 -cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4 +%if %1 != %2 + %assign function_align 8 +%else + %assign function_align 16 +%endif +cglobal x264_pixel_ssd_%1x%2_%3, 0,0,0 + mov al, %1*%2/mmsize/2 + +%if %1 != %2 + jmp mangle(x264_pixel_ssd_%1x%1_%3.startloop) +%else + +.startloop: +%ifdef ARCH_X86_64 + DECLARE_REG_TMP 0,1,2,3 +%else + PROLOGUE 0,5 + DECLARE_REG_TMP 1,2,3,4 + mov t0, r0m + mov t1, r1m + mov t2, r2m + mov t3, r3m 
+%endif + %ifidn %3, ssse3 mova m7, [hsub_mul GLOBAL] %elifidn %3, sse2 @@ -229,57 +247,57 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4 %elif %1 >= mmsize pxor m7, m7 %endif -%assign i 0 -%rep %2/4 + pxor m0, m0 + +ALIGN 16 +.loop: %if %1 > mmsize - SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0 - SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1 - SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0 - SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1 + SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 %elif %1 == mmsize - SSD_ITER FULL, 0, 0, r1, r3, i, 1 - SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1 + SSD_ITER FULL, 0, 0, t1, t3, 2 %else - SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1 + SSD_ITER HALF, 0, 0, t1, t3, 2 %endif -%assign i i+1 -%endrep + dec al + jg .loop HADDD m0, m1 movd eax, m0 RET +%endif %endmacro INIT_MMX SSD 16, 16, mmx SSD 16, 8, mmx -SSD 8, 16, mmx SSD 8, 8, mmx +SSD 8, 16, mmx +SSD 4, 4, mmx SSD 8, 4, mmx SSD 4, 8, mmx -SSD 4, 4, mmx INIT_XMM SSD 16, 16, sse2slow, 8 +SSD 8, 8, sse2slow, 8 SSD 16, 8, sse2slow, 8 SSD 8, 16, sse2slow, 8 -SSD 8, 8, sse2slow, 8 SSD 8, 4, sse2slow, 8 %define SSD_CORE SSD_CORE_SSE2 %define JOIN JOIN_SSE2 SSD 16, 16, sse2, 8 +SSD 8, 8, sse2, 8 SSD 16, 8, sse2, 8 SSD 8, 16, sse2, 8 -SSD 8, 8, sse2, 8 SSD 8, 4, sse2, 8 %define SSD_CORE SSD_CORE_SSSE3 %define JOIN JOIN_SSSE3 SSD 16, 16, ssse3, 8 +SSD 8, 8, ssse3, 8 SSD 16, 8, ssse3, 8 SSD 8, 16, ssse3, 8 -SSD 8, 8, ssse3, 8 SSD 8, 4, ssse3, 8 INIT_MMX -SSD 4, 8, ssse3 SSD 4, 4, ssse3 +SSD 4, 8, ssse3 +%assign function_align 16 ;============================================================================= ; variance diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index d42f4f38..753fa963 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -40,6 +40,12 @@ %endif %endif +%ifdef PREFIX + %define mangle(x) _ %+ x +%else + %define mangle(x) x +%endif + ; FIXME: All of the 64bit asm functions that take a stride as an argument ; via register, assume that the high dword of that register is filled with 0. ; This is true in practice (since we never do any 64bit arithmetic on strides, @@ -446,9 +452,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] ; Symbol prefix for C linkage %macro cglobal 1-2+ - %ifdef PREFIX - %xdefine %1 _ %+ %1 - %endif + %xdefine %1 mangle(%1) %xdefine %1.skip_prologue %1 %+ .skip_prologue %ifidn __OUTPUT_FORMAT__,elf global %1:function hidden @@ -465,9 +469,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %endmacro %macro cextern 1 - %ifdef PREFIX - %xdefine %1 _%1 - %endif + %xdefine %1 mangle(%1) extern %1 %endmacro -- 2.40.0
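
Where the size reduction comes from: the inner SSD loop is no longer unrolled once per block size. Each non-square x264_pixel_ssd_WxH_* now assembles to a two-instruction stub ("mov al, W*H/mmsize/2" to set the iteration count, then a jmp into the matching square function's .startloop), so the ALIGN 16'd loop body is emitted once per instruction set instead of once per size. The instantiation lists are reordered so every square size is emitted before the non-square sizes that borrow its loop, and mangle() is hoisted out of cglobal in x86inc.asm so the jmp can name the C-prefixed symbol directly on platforms that define PREFIX.

For readers who don't want to trace the macros, here is a rough C analogue of the dispatch pattern. The names (ssd_16rows and friends) are made up for illustration and are not x264 symbols, and the real stubs are two instructions rather than calls:

#include <stdint.h>
#include <stdio.h>

/* Shared loop body: the analogue of the single ".loop" emitted per ISA.
 * 'rows' plays the role of the count loaded into al. */
static int ssd_16rows( const uint8_t *a, int ia,
                       const uint8_t *b, int ib, int rows )
{
    int sum = 0;
    for( int y = 0; y < rows; y++, a += ia, b += ib )
        for( int x = 0; x < 16; x++ )
        {
            int d = a[x] - b[x];
            sum += d * d;
        }
    return sum;
}

/* Per-size entry points collapse to "pick a count, reuse the loop",
 * mirroring "mov al, N; jmp ...startloop" in the asm. */
int ssd_16x16( const uint8_t *a, int ia, const uint8_t *b, int ib )
{ return ssd_16rows( a, ia, b, ib, 16 ); }
int ssd_16x8( const uint8_t *a, int ia, const uint8_t *b, int ib )
{ return ssd_16rows( a, ia, b, ib, 8 ); }

int main( void )
{
    uint8_t a[16*16], b[16*16];
    for( int i = 0; i < 16*16; i++ )
    {
        a[i] = (uint8_t)i;
        b[i] = (uint8_t)( i + (i & 3) ); /* small known difference */
    }
    printf( "ssd_16x16 = %d\n", ssd_16x16( a, 16, b, 16 ) );
    printf( "ssd_16x8  = %d\n", ssd_16x8( a, 16, b, 16 ) );
    return 0;
}

The trade is one taken jmp per call to a non-square size, plus function_align dropped to 8 for the stubs; in exchange, each load/subtract/pmaddwd/accumulate body exists exactly once per ISA flavour, which is where the ~10k of code size goes.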