From d2ab724f262f831a320ba75b81092bc182bca695 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Wed, 12 Apr 2006 06:28:52 +0000
Subject: [PATCH] 3% faster satd_mmx

git-svn-id: svn://svn.videolan.org/x264/trunk@497 df754926-b1dd-0310-bc7b-ec298dee348c
---
 common/amd64/pixel-a.asm |  96 ++++++++++++++---------
 common/i386/pixel-a.asm  | 130 ++++++++++++++++++---------------------
 2 files changed, 105 insertions(+), 121 deletions(-)

diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm
index d11d502e..be067d89 100644
--- a/common/amd64/pixel-a.asm
+++ b/common/amd64/pixel-a.asm
@@ -326,16 +326,6 @@ BITS 64
     psubw %1, %2
 %endmacro
 
-; in: %1 = horizontal offset
-; out: mm4..mm7 = 16bit diffs
-; clobber: mm3
-%macro LOAD_DIFF_4x4 1
-    LOAD_DIFF_4P mm4, mm3, [parm1q+%1], [parm3q+%1]
-    LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%1], [parm3q+parm4q+%1]
-    LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%1], [parm3q+2*parm4q+%1]
-    LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%1], [parm3q+r11+%1]
-%endmacro
-
 %macro HADAMARD4_SUB_BADC 4
     paddw %1, %2
     paddw %3, %4
@@ -369,20 +359,21 @@ BITS 64
     SBUTTERFLYdq %5, %2, %3
 %endmacro
 
-%macro MMX_ABS 2 ; mma, mmt
-    pxor %2, %2
-    psubw %2, %1
-    pmaxsw %1, %2
+%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
+    pxor %3, %3
+    pxor %4, %4
+    psubw %3, %1
+    psubw %4, %2
+    pmaxsw %1, %3
+    pmaxsw %2, %4
 %endmacro
 
 %macro HADAMARD4x4_SUM 1 ; %1 = dest (row sum of one block)
     HADAMARD4x4 mm4, mm5, mm6, mm7
     TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
     HADAMARD4x4 mm4, mm7, %1, mm6
-    MMX_ABS mm4, mm5
-    MMX_ABS mm7, mm5
-    MMX_ABS %1, mm5
-    MMX_ABS mm6, mm5
+    MMX_ABS_TWO mm4, mm7, mm3, mm5
+    MMX_ABS_TWO %1, mm6, mm3, mm5
     paddw %1, mm4
     paddw mm6, mm7
     pavgw %1, mm6
@@ -394,7 +385,10 @@ BITS 64
 ; clobber: mm3..mm7
 ; out: %1 = satd
 %macro LOAD_DIFF_HADAMARD_SUM 3
-    LOAD_DIFF_4x4 %2
+    LOAD_DIFF_4P mm4, mm3, [parm1q+%2], [parm3q+%2]
+    LOAD_DIFF_4P mm5, mm3, [parm1q+parm2q+%2], [parm3q+parm4q+%2]
+    LOAD_DIFF_4P mm6, mm3, [parm1q+2*parm2q+%2], [parm3q+2*parm4q+%2]
+    LOAD_DIFF_4P mm7, mm3, [parm1q+r10+%2], [parm3q+r11+%2]
 %if %3
     lea parm1q, [parm1q+4*parm2q]
     lea parm3q, [parm3q+4*parm4q]
@@ -642,10 +636,10 @@ x264_pixel_satd_8x8_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -657,19 +651,19 @@ x264_pixel_satd_16x8_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     paddw mm0, mm2
 
-    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm2, 4, 0
-    paddw mm1, mm2
+    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -681,19 +675,19 @@ x264_pixel_satd_8x16_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     paddw mm0, mm2
 
-    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
+    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     paddw mm1, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -705,37 +699,37 @@ x264_pixel_satd_16x16_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 0
     LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     paddw mm0, mm2
 
-    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm2, 4, 0
-    paddw mm1, mm2
+    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     paddw mm0, mm2
 
-    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm2, 4, 0
-    paddw mm1, mm2
+    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 1
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     paddw mm0, mm2
 
-    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
-    LOAD_DIFF_HADAMARD_SUM mm2, 4, 0
-    paddw mm1, mm2
+    LOAD_DIFF_HADAMARD_SUM mm2, 0, 0
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
     paddw mm0, mm2
+    paddw mm0, mm1
 
     pxor mm3, mm3
     pshufw mm1, mm0, 01001110b
diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm
index 82efecb5..d171c019 100644
--- a/common/i386/pixel-a.asm
+++ b/common/i386/pixel-a.asm
@@ -340,37 +340,6 @@ BITS 32
 
 ; satd
 
-%macro LOAD_DIFF_4P 4 ; MMP, MMT, [pix1], [pix2]
-    movd %1, %3
-    movd %2, %4
-    punpcklbw %1, %2
-    punpcklbw %2, %2
-    psubw %1, %2
-%endmacro
-
-; in: eax=pix1, ebx=stride1, ecx=pix2, edx=stride2, %1=horizontal offset
-; out: mm4..mm7= 16bit diffs
-; clobber: mm3
-%macro LOAD_DIFF_INC_4x4 1
-    LOAD_DIFF_4P mm4, mm3, [eax+%1], [ecx+%1]
-    LOAD_DIFF_4P mm5, mm3, [eax+ebx+%1], [ecx+edx+%1]
-    lea eax, [eax+2*ebx]
-    lea ecx, [ecx+2*edx]
-    LOAD_DIFF_4P mm6, mm3, [eax+%1], [ecx+%1]
-    LOAD_DIFF_4P mm7, mm3, [eax+ebx+%1], [ecx+edx+%1]
-    lea eax, [eax+2*ebx]
-    lea ecx, [ecx+2*edx]
-%endmacro
-
-%macro LOAD_DIFF_4x4 1
-    LOAD_DIFF_4P mm4, mm3, [eax+%1], [ecx+%1]
-    LOAD_DIFF_4P mm6, mm3, [eax+2*ebx+%1], [ecx+2*edx+%1]
-    add eax, ebx
-    add ecx, edx
-    LOAD_DIFF_4P mm5, mm3, [eax+%1], [ecx+%1]
-    LOAD_DIFF_4P mm7, mm3, [eax+2*ebx+%1], [ecx+2*edx+%1]
-%endmacro
-
 %macro HADAMARD4_SUB_BADC 4
     paddw %1, %2
     paddw %3, %4
@@ -404,34 +373,55 @@ BITS 32
     SBUTTERFLYdq %5, %2, %3
 %endmacro
 
-%macro MMX_ABS 2 ; mma, mmt
-    pxor %2, %2
-    psubw %2, %1
-    pmaxsw %1, %2
+%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
+    pxor %3, %3
+    pxor %4, %4
+    psubw %3, %1
+    psubw %4, %2
+    pmaxsw %1, %3
+    pmaxsw %2, %4
 %endmacro
 
 %macro HADAMARD4x4_SUM 1 ; %1 - dest (row sum of one block)
     HADAMARD4x4 mm4, mm5, mm6, mm7
     TRANSPOSE4x4 mm4, mm5, mm6, mm7, %1
     HADAMARD4x4 mm4, mm7, %1, mm6
-    MMX_ABS mm4, mm5
-    MMX_ABS mm7, mm5
-    MMX_ABS %1, mm5
-    MMX_ABS mm6, mm5
+    MMX_ABS_TWO mm4, mm7, mm3, mm5
+    MMX_ABS_TWO %1, mm6, mm3, mm5
     paddw %1, mm4
     paddw mm6, mm7
     pavgw %1, mm6
 %endmacro
 
+%macro LOAD_DIFF_4P 3 ; mmp, dx, dy
+    movd %1, [eax+ebx*%3+%2]
+    movd mm3, [ecx+edx*%3+%2]
+    punpcklbw %1, mm3
+    punpcklbw mm3, mm3
+    psubw %1, mm3
+%endmacro
+
 ; in: %2 = horizontal offset
 ; in: %3 = whether we need to increment pix1 and pix2
 ; clobber: mm3..mm7
 ; out: %1 = satd
 %macro LOAD_DIFF_HADAMARD_SUM 3
 %if %3
-    LOAD_DIFF_INC_4x4 %2
+    LOAD_DIFF_4P mm4, %2, 0
+    LOAD_DIFF_4P mm5, %2, 1
+    lea eax, [eax+2*ebx]
+    lea ecx, [ecx+2*edx]
+    LOAD_DIFF_4P mm6, %2, 0
+    LOAD_DIFF_4P mm7, %2, 1
+    lea eax, [eax+2*ebx]
+    lea ecx, [ecx+2*edx]
 %else
-    LOAD_DIFF_4x4 %2
+    LOAD_DIFF_4P mm4, %2, 0
+    LOAD_DIFF_4P mm6, %2, 2
+    add eax, ebx
+    add ecx, edx
+    LOAD_DIFF_4P mm5, %2, 0
+    LOAD_DIFF_4P mm7, %2, 2
 %endif
     HADAMARD4x4_SUM %1
 %endmacro
@@ -697,10 +687,10 @@ x264_pixel_satd_8x8_mmxext:
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
    paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -715,24 +705,24 @@ x264_pixel_satd_16x8_mmxext:
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
-    LOAD_DIFF_HADAMARD_SUM mm1, 8, 1
-    LOAD_DIFF_HADAMARD_SUM mm2, 8, 0
-    paddw mm1, mm2
+    LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
+    paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
     LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
     paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -744,21 +734,21 @@ x264_pixel_satd_8x16_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 0, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
     paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
-    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    paddw mm1, mm2
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     paddw mm0, mm2
+    paddw mm0, mm1
     SATD_END
 
 ALIGN 16
@@ -770,43 +760,43 @@ x264_pixel_satd_16x16_mmxext:
     LOAD_DIFF_HADAMARD_SUM mm0, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm1, 0, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 0, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 0, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 0, 0
     paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
-    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    paddw mm1, mm2
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 1
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 4, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 4, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 4, 0
     paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
-    LOAD_DIFF_HADAMARD_SUM mm1, 8, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
-    paddw mm1, mm2
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 8, 1
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 8, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 8, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 8, 0
     paddw mm0, mm2
 
     mov eax, [esp+ 8] ; pix1
     mov ecx, [esp+16] ; pix2
-    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
     LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
-    paddw mm1, mm2
+    paddw mm0, mm1
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 1
+    paddw mm0, mm2
     LOAD_DIFF_HADAMARD_SUM mm2, 12, 1
-    LOAD_DIFF_HADAMARD_SUM mm3, 12, 0
     paddw mm0, mm1
-    paddw mm2, mm3
+    LOAD_DIFF_HADAMARD_SUM mm1, 12, 0
     paddw mm0, mm2
+    paddw mm0, mm1
 
     pxor mm3, mm3
     pshufw mm1, mm0, 01001110b
-- 
2.40.0
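For readers following the asm: each LOAD_DIFF_HADAMARD_SUM invocation computes the SATD of one 4x4 block. It takes the difference of two 4x4 pixel blocks (LOAD_DIFF_4P), applies a 4x4 Hadamard transform to the rows and, after TRANSPOSE4x4, to the columns, takes absolute values, and halves each per-lane sum of four coefficients with rounding via pavgw. The restructuring in this patch interleaves each block's transform with the running paddw accumulation into mm0, so mm3 is no longer needed as a result register and is free to serve, together with mm5, as the temporaries of MMX_ABS_TWO, which computes two absolute values per invocation. The scalar model below is an illustrative sketch, not x264 code or part of this patch; satd_4x4_ref and its signature are invented names for the example.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar model (not x264 code) of the 4x4 SATD that one
 * LOAD_DIFF_HADAMARD_SUM computes: difference block -> 4x4 Hadamard
 * transform on rows, then on columns -> sum of absolute coefficients,
 * halved with rounding per group of four (the asm folds the halving
 * into pavgw).  The sign conventions of HADAMARD4_SUB_BADC differ from
 * the textbook butterfly used here, but the absolute values match. */
static int satd_4x4_ref(const uint8_t *pix1, int stride1,
                        const uint8_t *pix2, int stride2)
{
    int d[4][4], t[4][4];
    int sum = 0;

    /* LOAD_DIFF_4P: 16-bit differences of the two blocks */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[y][x] = pix1[y*stride1 + x] - pix2[y*stride2 + x];

    /* first HADAMARD4x4: transform each row */
    for (int y = 0; y < 4; y++) {
        int s01 = d[y][0] + d[y][1], d01 = d[y][0] - d[y][1];
        int s23 = d[y][2] + d[y][3], d23 = d[y][2] - d[y][3];
        t[y][0] = s01 + s23;
        t[y][1] = s01 - s23;
        t[y][2] = d01 + d23;
        t[y][3] = d01 - d23;
    }

    /* TRANSPOSE4x4 + second HADAMARD4x4: transform each column, then
     * MMX_ABS_TWO/paddw/pavgw: sum the absolute coefficients of each
     * column and halve with rounding */
    for (int x = 0; x < 4; x++) {
        int s01 = t[0][x] + t[1][x], d01 = t[0][x] - t[1][x];
        int s23 = t[2][x] + t[3][x], d23 = t[2][x] - t[3][x];
        int col = abs(s01 + s23) + abs(s01 - s23)
                + abs(d01 + d23) + abs(d01 - d23);
        sum += (col + 1) >> 1;
    }
    return sum;
}

The satd_8x8/16x8/8x16/16x16 functions in the patch simply accumulate this per-4x4 quantity in mm0 over the whole partition before the final horizontal reduction of mm0's lanes.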