From 6b577361fbab9d785787eba3e16a63a23d84be28 Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Mon, 24 Apr 2006 03:52:55 +0000 Subject: [PATCH] mmx implementation of x264_pixel_sa8d git-svn-id: svn://svn.videolan.org/x264/trunk@507 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/pixel-sse2.asm | 109 +++++++++++++++++++ common/i386/pixel-a.asm | 202 +++++++++++++++++++++++++++++++++--- common/i386/pixel.h | 6 ++ common/pixel.c | 12 ++- tools/checkasm.c | 1 + 5 files changed, 312 insertions(+), 18 deletions(-) diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm index 61f4fc8c..f616ecd2 100644 --- a/common/amd64/pixel-sse2.asm +++ b/common/amd64/pixel-sse2.asm @@ -45,6 +45,8 @@ cglobal x264_pixel_satd_8x8_sse2 cglobal x264_pixel_satd_16x8_sse2 cglobal x264_pixel_satd_8x16_sse2 cglobal x264_pixel_satd_16x16_sse2 +cglobal x264_pixel_sa8d_8x8_sse2 +cglobal x264_pixel_sa8d_16x16_sse2 %macro SAD_INC_4x16P_SSE2 0 movdqu xmm1, [rdx] @@ -506,3 +508,110 @@ x264_pixel_satd_8x4_sse2: SATD_END + +%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2] + movq %1, %3 + movq %2, %4 + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 +%endmacro + +%macro SBUTTERFLY 5 + mov%1 %5, %3 + punpckl%2 %3, %4 + punpckh%2 %5, %4 +%endmacro + +;----------------------------------------------------------------------------- +; input ABCDEFGH output AFHDTECB +;----------------------------------------------------------------------------- +%macro TRANSPOSE8x8 9 + SBUTTERFLY dqa, wd, %1, %2, %9 + SBUTTERFLY dqa, wd, %3, %4, %2 + SBUTTERFLY dqa, wd, %5, %6, %4 + SBUTTERFLY dqa, wd, %7, %8, %6 + SBUTTERFLY dqa, dq, %1, %3, %8 + SBUTTERFLY dqa, dq, %9, %2, %3 + SBUTTERFLY dqa, dq, %5, %7, %2 + SBUTTERFLY dqa, dq, %4, %6, %7 + SBUTTERFLY dqa, qdq, %1, %5, %6 + SBUTTERFLY dqa, qdq, %9, %4, %5 + SBUTTERFLY dqa, qdq, %8, %2, %4 + SBUTTERFLY dqa, qdq, %3, %7, %2 +%endmacro + +%macro SUMSUB_BADC 4 + paddw %1, %2 + paddw %3, %4 + paddw %2, %2 + paddw %4, %4 + psubw %2, %1 + psubw %4, %3 +%endmacro + +%macro HADAMARD1x8 8 + SUMSUB_BADC %1, %5, %2, %6 + SUMSUB_BADC %3, %7, %4, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %5, %6, %7, %8 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +x264_pixel_sa8d_8x8_sse2: + lea r10, [3*parm2q] + lea r11, [3*parm4q] + LOAD_DIFF_8P xmm0, xmm8, [parm1q], [parm3q] + LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q], [parm3q+parm4q] + LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q] + LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10], [parm3q+r11] + lea parm1q, [parm1q+4*parm2q] + lea parm3q, [parm3q+4*parm4q] + LOAD_DIFF_8P xmm4, xmm8, [parm1q], [parm3q] + LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q], [parm3q+parm4q] + LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q] + LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10], [parm3q+r11] + + HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 + TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 + HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1 + + pxor xmm10, xmm10 + SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 + SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 + SUM_MM_SSE2 xmm10, xmm0 + add r8d, eax ; preserve rounding for 16x16 + add eax, 1 + shr eax, 1 + ret + +ALIGN 16 
+;----------------------------------------------------------------------------- +; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ) +;----------------------------------------------------------------------------- +;; violates calling convention +x264_pixel_sa8d_16x16_sse2: + xor r8d, r8d + call x264_pixel_sa8d_8x8_sse2 ; pix[0] + lea parm1q, [parm1q+4*parm2q] + lea parm3q, [parm3q+4*parm4q] + call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride] + lea r10, [3*parm2q-2] + lea r11, [3*parm4q-2] + shl r10, 2 + shl r11, 2 + sub parm1q, r10 + sub parm3q, r11 + call x264_pixel_sa8d_8x8_sse2 ; pix[8] + lea parm1q, [parm1q+4*parm2q] + lea parm3q, [parm3q+4*parm4q] + call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8] + mov eax, r8d + add eax, 1 + shr eax, 1 + ret diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index d171c019..834841e8 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -340,7 +340,7 @@ BITS 32 ; satd -%macro HADAMARD4_SUB_BADC 4 +%macro SUMSUB_BADC 4 paddw %1, %2 paddw %3, %4 paddw %2, %2 @@ -350,8 +350,8 @@ BITS 32 %endmacro %macro HADAMARD4x4 4 - HADAMARD4_SUB_BADC %1, %2, %3, %4 - HADAMARD4_SUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %1, %3, %2, %4 %endmacro %macro SBUTTERFLYwd 3 @@ -373,6 +373,12 @@ BITS 32 SBUTTERFLYdq %5, %2, %3 %endmacro +%macro MMX_ABS 2 ; mma, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + %macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1 pxor %3, %3 pxor %4, %4 @@ -393,12 +399,12 @@ BITS 32 pavgw %1, mm6 %endmacro -%macro LOAD_DIFF_4P 3 ; mmp, dx, dy - movd %1, [eax+ebx*%3+%2] - movd mm3, [ecx+edx*%3+%2] - punpcklbw %1, mm3 - punpcklbw mm3, mm3 - psubw %1, mm3 +%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy + movd %1, [eax+ebx*%4+%3] + movd %2, [ecx+edx*%4+%3] + punpcklbw %1, %2 + punpcklbw %2, %2 + psubw %1, %2 %endmacro ; in: %2 = horizontal offset @@ -407,21 +413,21 @@ BITS 32 ; out: %1 = satd %macro LOAD_DIFF_HADAMARD_SUM 3 %if %3 - LOAD_DIFF_4P mm4, %2, 0 - LOAD_DIFF_4P mm5, %2, 1 + LOAD_DIFF_4P mm4, mm3, %2, 0 + LOAD_DIFF_4P mm5, mm3, %2, 1 lea eax, [eax+2*ebx] lea ecx, [ecx+2*edx] - LOAD_DIFF_4P mm6, %2, 0 - LOAD_DIFF_4P mm7, %2, 1 + LOAD_DIFF_4P mm6, mm3, %2, 0 + LOAD_DIFF_4P mm7, mm3, %2, 1 lea eax, [eax+2*ebx] lea ecx, [ecx+2*edx] %else - LOAD_DIFF_4P mm4, %2, 0 - LOAD_DIFF_4P mm6, %2, 2 + LOAD_DIFF_4P mm4, mm3, %2, 0 + LOAD_DIFF_4P mm6, mm3, %2, 2 add eax, ebx add ecx, edx - LOAD_DIFF_4P mm5, %2, 0 - LOAD_DIFF_4P mm7, %2, 2 + LOAD_DIFF_4P mm5, mm3, %2, 0 + LOAD_DIFF_4P mm7, mm3, %2, 2 %endif HADAMARD4x4_SUM %1 %endmacro @@ -476,6 +482,9 @@ cglobal x264_pixel_satd_16x8_mmxext cglobal x264_pixel_satd_8x16_mmxext cglobal x264_pixel_satd_16x16_mmxext +cglobal x264_pixel_sa8d_16x16_mmxext +cglobal x264_pixel_sa8d_8x8_mmxext + %macro SAD_START 0 push ebx @@ -808,3 +817,162 @@ x264_pixel_satd_16x16_mmxext: pop ebx ret + +%macro LOAD_DIFF_4x8P 1 ; dx + LOAD_DIFF_4P mm0, mm7, %1, 0 + LOAD_DIFF_4P mm1, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm2, mm7, %1, 0 + LOAD_DIFF_4P mm3, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm4, mm7, %1, 0 + LOAD_DIFF_4P mm5, mm7, %1, 1 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + LOAD_DIFF_4P mm6, mm7, %1, 0 + movq [spill], mm6 + LOAD_DIFF_4P mm7, mm6, %1, 1 + movq mm6, [spill] +%endmacro + +%macro HADAMARD1x8 8 + SUMSUB_BADC %1, %5, %2, %6 + SUMSUB_BADC %3, %7, %4, %8 + SUMSUB_BADC %1, %3, %2, %4 + SUMSUB_BADC %5, %7, %6, %8 + SUMSUB_BADC %1, %2, %3, %4 + SUMSUB_BADC %5, %6, %7, %8 +%endmacro + 
+%macro SUM4x8_MM 0
+    movq [spill], mm7
+    MMX_ABS mm0, mm7
+    MMX_ABS mm1, mm7
+    MMX_ABS mm2, mm7
+    MMX_ABS mm3, mm7
+    paddw mm0, mm2
+    paddw mm1, mm3
+    movq mm7, [spill]
+    MMX_ABS_TWO mm4, mm5, mm2, mm3
+    MMX_ABS_TWO mm6, mm7, mm2, mm3
+    paddw mm4, mm6
+    paddw mm5, mm7
+    paddw mm0, mm4
+    paddw mm1, mm5
+    paddw mm0, mm1
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sa8d_8x8_mmxext:
+    SATD_START
+    sub  esp, 0x68
+%define args  esp+0x6c
+%define spill esp+0x60
+    LOAD_DIFF_4x8P 0
+    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+    movq [spill], mm0
+    TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 ; abcd-t -> adtc
+    movq [esp+0x00], mm4
+    movq [esp+0x08], mm7
+    movq [esp+0x10], mm0
+    movq [esp+0x18], mm6
+    movq mm0, [spill]
+    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+    movq [esp+0x20], mm0
+    movq [esp+0x28], mm3
+    movq [esp+0x30], mm4
+    movq [esp+0x38], mm2
+
+    mov  eax, [args+4]
+    mov  ecx, [args+12]
+    LOAD_DIFF_4x8P 4
+    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+    movq [spill], mm4
+    TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+    movq [esp+0x40], mm0
+    movq [esp+0x48], mm3
+    movq [esp+0x50], mm4
+    movq [esp+0x58], mm2
+    movq mm4, [spill]
+    TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
+    movq mm5, [esp+0x00]
+    movq mm1, [esp+0x08]
+    movq mm2, [esp+0x10]
+    movq mm3, [esp+0x18]
+
+    HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+    SUM4x8_MM
+    movq [esp], mm0
+
+    movq mm0, [esp+0x20]
+    movq mm1, [esp+0x28]
+    movq mm2, [esp+0x30]
+    movq mm3, [esp+0x38]
+    movq mm4, [esp+0x40]
+    movq mm5, [esp+0x48]
+    movq mm6, [esp+0x50]
+    movq mm7, [esp+0x58]
+
+    HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+    SUM4x8_MM
+
+    pavgw mm0, [esp]
+    pshufw mm1, mm0, 01001110b
+    paddw mm0, mm1
+    pshufw mm1, mm0, 10110001b
+    paddw mm0, mm1
+    movd eax, mm0
+    and  eax, 0xffff
+    mov  ecx, eax ; preserve rounding for 16x16
+    add  eax, 1
+    shr  eax, 1
+    add  esp, 0x68
+    pop  ebx
+    ret
+%undef args
+%undef spill
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+;   int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+;; violates calling convention
+x264_pixel_sa8d_16x16_mmxext:
+    push esi
+    push edi
+    push ebp
+    mov  esi, [esp+28]    ; stride2
+    mov  edi, [esp+20]    ; stride1
+    push esi
+    push dword [esp+28]   ; pix2
+    push edi
+    push dword [esp+28]   ; pix1
+    call x264_pixel_sa8d_8x8_mmxext
+    mov  ebp, ecx
+    shl  edi, 3
+    shl  esi, 3
+    add  [esp+0], edi     ; pix1+8*stride1
+    add  [esp+8], esi     ; pix2+8*stride2
+    call x264_pixel_sa8d_8x8_mmxext
+    add  ebp, ecx
+    add  dword [esp+0], 8 ; pix1+8*stride1+8
+    add  dword [esp+8], 8 ; pix2+8*stride2+8
+    call x264_pixel_sa8d_8x8_mmxext
+    add  ebp, ecx
+    sub  [esp+0], edi     ; pix1+8
+    sub  [esp+8], esi     ; pix2+8
+    call x264_pixel_sa8d_8x8_mmxext
+    lea  eax, [ebp+ecx+1]
+    shr  eax, 1
+    add  esp, 16
+    pop  ebp
+    pop  edi
+    pop  esi
+    ret
diff --git a/common/i386/pixel.h b/common/i386/pixel.h
index c0f9f3e4..8baf7c7a 100644
--- a/common/i386/pixel.h
+++ b/common/i386/pixel.h
@@ -67,6 +67,9 @@ int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+
 int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
 
@@ -84,4 +87,7 @@ int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
 int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
 
+int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
+
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index 4e3e7870..d98b7c7e 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -365,6 +365,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
         pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
         pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
+
+#ifdef ARCH_X86
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
+        pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
+#endif
     }
 #endif
 
@@ -381,7 +386,7 @@
         pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
         pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
 
-#ifndef ARCH_X86_64
+#ifdef ARCH_X86
 
         pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2;
         pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2;
@@ -394,6 +399,11 @@
     {
         pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
         pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2;
+
+#ifdef ARCH_X86_64
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
+        pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
+#endif
     }
 #endif
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 680d4706..e91f1bf8 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -60,6 +60,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL( sad );
     TEST_PIXEL( ssd );
     TEST_PIXEL( satd );
+    TEST_PIXEL( sa8d );
 
 #define TEST_PIXEL_X( N ) \
     for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
-- 
2.40.0
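
Implementation notes. sa8d is the 8x8 analogue of satd: for each 8x8 block
it sums the absolute values of the 2D 8x8 Hadamard transform of the pixel
differences, then normalizes by 4 with rounding. A minimal, unoptimized C
sketch of that measure follows; it is an illustrative model only, not the
scalar routine from common/pixel.c, and the function and variable names
are hypothetical:

    #include <stdint.h>
    #include <stdlib.h>

    static int sa8d_8x8_ref( uint8_t *pix1, int i_pix1,
                             uint8_t *pix2, int i_pix2 )
    {
        /* Max |coefficient| after both passes is 255*64 = 16320, so the
         * whole transform fits in 16-bit words -- which is why the asm
         * can stay in packed-word arithmetic throughout. */
        int16_t d[8][8];
        int i, j, k, s, a, b, sum = 0;

        for( i = 0; i < 8; i++ )
            for( j = 0; j < 8; j++ )
                d[i][j] = pix1[i*i_pix1+j] - pix2[i*i_pix2+j];

        /* 8-point Hadamard on each row, then on each column: three
         * butterfly stages per pass, equivalent (up to coefficient
         * order, which abs-summing ignores) to HADAMARD1x8. */
        for( i = 0; i < 8; i++ )
            for( s = 1; s < 8; s <<= 1 )
                for( j = 0; j < 8; j += 2*s )
                    for( k = j; k < j+s; k++ )
                    {
                        a = d[i][k];
                        b = d[i][k+s];
                        d[i][k]   = a + b;
                        d[i][k+s] = a - b;
                    }
        for( j = 0; j < 8; j++ )
            for( s = 1; s < 8; s <<= 1 )
                for( i = 0; i < 8; i += 2*s )
                    for( k = i; k < i+s; k++ )
                    {
                        a = d[k][j];
                        b = d[k+s][j];
                        d[k][j]   = a + b;
                        d[k+s][j] = a - b;
                    }

        for( i = 0; i < 8; i++ )
            for( j = 0; j < 8; j++ )
                sum += abs( d[i][j] );

        return ( sum + 2 ) >> 2;
    }

The asm splits the >>2 into two rounded halvings (pavgw across the two
half-block sums in the mmx version, then the final add/shr). Since |x| and
x have the same parity, the raw sum is congruent mod 2 to the sum of all
transform coefficients, which is 64*d[0][0]; the raw sum is therefore
always even, and the two-step halving matches a single (sum+2)>>2.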
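
On the SUMSUB_BADC idiom (the renamed HADAMARD4_SUB_BADC): it packs two
sum/difference butterflies into six packed-word instructions with no
temporary register. After paddw a,b / paddw b,b / psubw b,a the pair holds
a+b and b-a; the flipped sign of the difference is harmless because only
absolute values are summed afterwards. HADAMARD1x8 then chains three
stages of four such butterflies (partner distance 4, then 2, then 1) to
form an 8-point transform across eight registers. A scalar model of the
idiom, with hypothetical names:

    #include <stdio.h>

    /* Butterflies (a,b) and (c,d) in place, no temporary:
     * b ends as b-a (not a-b), d as d-c. */
    static void sumsub_badc( int *a, int *b, int *c, int *d )
    {
        *a += *b;  /* a = a+b            */
        *c += *d;
        *b += *b;  /* b = 2b             */
        *d += *d;
        *b -= *a;  /* b = 2b-(a+b) = b-a */
        *d -= *c;
    }

    int main( void )
    {
        int a = 3, b = 5, c = 10, d = 4;
        sumsub_badc( &a, &b, &c, &d );
        printf( "%d %d %d %d\n", a, b, c, d ); /* prints: 8 2 14 -6 */
        return 0;
    }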
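
On the ";; violates calling convention" tag: the 8x8 helpers return a
second value outside the C ABI -- the pre-rounding sum is accumulated in
r8d on amd64 and left in ecx on x86 ("preserve rounding for 16x16") -- and
the amd64 16x16 wrapper additionally depends on exactly how far the 8x8
body advances its argument registers (hence the 3*stride-2 correction
before the pix[8] call). That is safe only because these are private
asm-to-asm calls. The payoff is rounding once over all four sub-blocks
instead of four times, which changes the result; a tiny demonstration with
made-up per-block sums:

    #include <stdio.h>

    int main( void )
    {
        int s[4] = { 3, 3, 3, 3 };  /* hypothetical raw per-block sums */
        int per_block = 0, total = 0, i;
        for( i = 0; i < 4; i++ )
        {
            per_block += ( s[i] + 1 ) >> 1;  /* round each block */
            total += s[i];                   /* defer rounding   */
        }
        /* prints "8 vs 6": rounding four times overshoots */
        printf( "%d vs %d\n", per_block, ( total + 1 ) >> 1 );
        return 0;
    }

Deferring the rounding this way presumably lets the TEST_PIXEL( sa8d )
check added to checkasm demand an exact match against the C reference.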