cglobal x264_pixel_satd_16x8_sse2
cglobal x264_pixel_satd_8x16_sse2
cglobal x264_pixel_satd_16x16_sse2
+cglobal x264_pixel_sa8d_8x8_sse2
+cglobal x264_pixel_sa8d_16x16_sse2
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [rdx]
SATD_END
+
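+; %1 = 8 pixels of [pix1] minus 8 pixels of [pix2], widened to signed words.
+; Interleaving both registers with pix2's bytes makes the high bytes cancel
+; in the psubw, so no zeroed register is needed.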
+%macro LOAD_DIFF_8P 4 ; MMP, MMT, [pix1], [pix2]
+ movq %1, %3
+ movq %2, %4
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
+%endmacro
+
+%macro SBUTTERFLY 5
+ mov%1 %5, %3
+ punpckl%2 %3, %4
+ punpckh%2 %5, %4
+%endmacro
+
+;-----------------------------------------------------------------------------
+; input ABCDEFGH output AFHDTECB
+;-----------------------------------------------------------------------------
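+; (T is the temp register %9; the register passed as G does not hold a result row)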
+%macro TRANSPOSE8x8 9
+ SBUTTERFLY dqa, wd, %1, %2, %9
+ SBUTTERFLY dqa, wd, %3, %4, %2
+ SBUTTERFLY dqa, wd, %5, %6, %4
+ SBUTTERFLY dqa, wd, %7, %8, %6
+ SBUTTERFLY dqa, dq, %1, %3, %8
+ SBUTTERFLY dqa, dq, %9, %2, %3
+ SBUTTERFLY dqa, dq, %5, %7, %2
+ SBUTTERFLY dqa, dq, %4, %6, %7
+ SBUTTERFLY dqa, qdq, %1, %5, %6
+ SBUTTERFLY dqa, qdq, %9, %4, %5
+ SBUTTERFLY dqa, qdq, %8, %2, %4
+ SBUTTERFLY dqa, qdq, %3, %7, %2
+%endmacro
+
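+; butterfly without a temporary: %1 = a+b, %2 = b-a, %3 = c+d, %4 = d-c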
+%macro SUMSUB_BADC 4
+ paddw %1, %2
+ paddw %3, %4
+ paddw %2, %2
+ paddw %4, %4
+ psubw %2, %1
+ psubw %4, %3
+%endmacro
+
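+; three butterfly stages: one 8-point Hadamard transform in each word lane,
+; spread across eight registers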
+%macro HADAMARD1x8 8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sa8d_8x8_sse2:
+ lea r10, [3*parm2q]
+ lea r11, [3*parm4q]
+ LOAD_DIFF_8P xmm0, xmm8, [parm1q], [parm3q]
+ LOAD_DIFF_8P xmm1, xmm9, [parm1q+parm2q], [parm3q+parm4q]
+ LOAD_DIFF_8P xmm2, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
+ LOAD_DIFF_8P xmm3, xmm9, [parm1q+r10], [parm3q+r11]
+ lea parm1q, [parm1q+4*parm2q]
+ lea parm3q, [parm3q+4*parm4q]
+ LOAD_DIFF_8P xmm4, xmm8, [parm1q], [parm3q]
+ LOAD_DIFF_8P xmm5, xmm9, [parm1q+parm2q], [parm3q+parm4q]
+ LOAD_DIFF_8P xmm6, xmm8, [parm1q+2*parm2q], [parm3q+2*parm4q]
+ LOAD_DIFF_8P xmm7, xmm9, [parm1q+r10], [parm3q+r11]
+
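+ ; vertical transform, transpose, horizontal transform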
+ HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ TRANSPOSE8x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+ HADAMARD1x8 xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1
+
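+ ; sum the absolute transform coefficients into xmm10, then reduce into eax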
+ pxor xmm10, xmm10
+ SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
+ SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
+ SUM_MM_SSE2 xmm10, xmm0
+ add r8d, eax ; accumulate the unrounded sum so the 16x16 version rounds only once
+ add eax, 1
+ shr eax, 1
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+;; violates calling convention
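+;; (the running sum travels in r8d and the pixel pointers stay live in the
+;;  argument registers across the nested sa8d_8x8 calls)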
+x264_pixel_sa8d_16x16_sse2:
+ xor r8d, r8d
+ call x264_pixel_sa8d_8x8_sse2 ; pix[0]
+ lea parm1q, [parm1q+4*parm2q]
+ lea parm3q, [parm3q+4*parm4q]
+ call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride]
+ lea r10, [3*parm2q-2]
+ lea r11, [3*parm4q-2]
+ shl r10, 2
+ shl r11, 2
+ sub parm1q, r10
+ sub parm3q, r11
+ call x264_pixel_sa8d_8x8_sse2 ; pix[8]
+ lea parm1q, [parm1q+4*parm2q]
+ lea parm3q, [parm3q+4*parm4q]
+ call x264_pixel_sa8d_8x8_sse2 ; pix[8*stride+8]
+ mov eax, r8d
+ add eax, 1
+ shr eax, 1
+ ret
; satd
-%macro HADAMARD4_SUB_BADC 4
+%macro SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
%endmacro
%macro HADAMARD4x4 4
- HADAMARD4_SUB_BADC %1, %2, %3, %4
- HADAMARD4_SUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %1, %3, %2, %4
%endmacro
%macro SBUTTERFLYwd 3
SBUTTERFLYdq %5, %2, %3
%endmacro
+%macro MMX_ABS 2 ; mma, tmp
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
pxor %3, %3
pxor %4, %4
pavgw %1, mm6
%endmacro
-%macro LOAD_DIFF_4P 3 ; mmp, dx, dy
- movd %1, [eax+ebx*%3+%2]
- movd mm3, [ecx+edx*%3+%2]
- punpcklbw %1, mm3
- punpcklbw mm3, mm3
- psubw %1, mm3
+%macro LOAD_DIFF_4P 4 ; mmp, mmt, dx, dy
+ movd %1, [eax+ebx*%4+%3]
+ movd %2, [ecx+edx*%4+%3]
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
%endmacro
; in: %2 = horizontal offset
; out: %1 = satd
%macro LOAD_DIFF_HADAMARD_SUM 3
%if %3
- LOAD_DIFF_4P mm4, %2, 0
- LOAD_DIFF_4P mm5, %2, 1
+ LOAD_DIFF_4P mm4, mm3, %2, 0
+ LOAD_DIFF_4P mm5, mm3, %2, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- LOAD_DIFF_4P mm6, %2, 0
- LOAD_DIFF_4P mm7, %2, 1
+ LOAD_DIFF_4P mm6, mm3, %2, 0
+ LOAD_DIFF_4P mm7, mm3, %2, 1
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
%else
- LOAD_DIFF_4P mm4, %2, 0
- LOAD_DIFF_4P mm6, %2, 2
+ LOAD_DIFF_4P mm4, mm3, %2, 0
+ LOAD_DIFF_4P mm6, mm3, %2, 2
add eax, ebx
add ecx, edx
- LOAD_DIFF_4P mm5, %2, 0
- LOAD_DIFF_4P mm7, %2, 2
+ LOAD_DIFF_4P mm5, mm3, %2, 0
+ LOAD_DIFF_4P mm7, mm3, %2, 2
%endif
HADAMARD4x4_SUM %1
%endmacro
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext
+cglobal x264_pixel_sa8d_16x16_mmxext
+cglobal x264_pixel_sa8d_8x8_mmxext
+
%macro SAD_START 0
push ebx
pop ebx
ret
+
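+; diff a 4-pixel-wide, 8-row strip at horizontal offset %1 into mm0-mm7;
+; mm6 is spilled to the stack while mm7 is loaded, since all eight MMX
+; registers are needed for the result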
+%macro LOAD_DIFF_4x8P 1 ; dx
+ LOAD_DIFF_4P mm0, mm7, %1, 0
+ LOAD_DIFF_4P mm1, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm2, mm7, %1, 0
+ LOAD_DIFF_4P mm3, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm4, mm7, %1, 0
+ LOAD_DIFF_4P mm5, mm7, %1, 1
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ LOAD_DIFF_4P mm6, mm7, %1, 0
+ movq [spill], mm6
+ LOAD_DIFF_4P mm7, mm6, %1, 1
+ movq mm6, [spill]
+%endmacro
+
+%macro HADAMARD1x8 8
+ SUMSUB_BADC %1, %5, %2, %6
+ SUMSUB_BADC %3, %7, %4, %8
+ SUMSUB_BADC %1, %3, %2, %4
+ SUMSUB_BADC %5, %7, %6, %8
+ SUMSUB_BADC %1, %2, %3, %4
+ SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
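+; sum of absolute values of mm0-mm7, accumulated into mm0
+; (mm7 is spilled to free up a temporary)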
+%macro SUM4x8_MM 0
+ movq [spill], mm7
+ MMX_ABS mm0, mm7
+ MMX_ABS mm1, mm7
+ MMX_ABS mm2, mm7
+ MMX_ABS mm3, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ movq mm7, [spill]
+ MMX_ABS_TWO mm4, mm5, mm2, mm3
+ MMX_ABS_TWO mm6, mm7, mm2, mm3
+ paddw mm4, mm6
+ paddw mm5, mm7
+ paddw mm0, mm4
+ paddw mm1, mm5
+ paddw mm0, mm1
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sa8d_8x8_mmxext:
+ SATD_START
+ sub esp, 0x68
+%define args esp+0x6c
+%define spill esp+0x60
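+; stack: 0x00-0x5f hold the transposed 4x4 sub-blocks, 0x60 is a spill slot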
+ LOAD_DIFF_4x8P 0
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm0
+ TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 ; abcd-t -> adtc
+ movq [esp+0x00], mm4
+ movq [esp+0x08], mm7
+ movq [esp+0x10], mm0
+ movq [esp+0x18], mm6
+ movq mm0, [spill]
+ TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+ movq [esp+0x20], mm0
+ movq [esp+0x28], mm3
+ movq [esp+0x30], mm4
+ movq [esp+0x38], mm2
+
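+ ; reload pix1/pix2 and diff the right 4x8 half (columns 4-7)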
+ mov eax, [args+4]
+ mov ecx, [args+12]
+ LOAD_DIFF_4x8P 4
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm4
+ TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+ movq [esp+0x40], mm0
+ movq [esp+0x48], mm3
+ movq [esp+0x50], mm4
+ movq [esp+0x58], mm2
+ movq mm4, [spill]
+ TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
+ movq mm5, [esp+0x00]
+ movq mm1, [esp+0x08]
+ movq mm2, [esp+0x10]
+ movq mm3, [esp+0x18]
+
+ HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+ SUM4x8_MM
+ movq [esp], mm0
+
+ movq mm0, [esp+0x20]
+ movq mm1, [esp+0x28]
+ movq mm2, [esp+0x30]
+ movq mm3, [esp+0x38]
+ movq mm4, [esp+0x40]
+ movq mm5, [esp+0x48]
+ movq mm6, [esp+0x50]
+ movq mm7, [esp+0x58]
+
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+ SUM4x8_MM
+
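+ ; average the two halves' sums, then add the four word sums horizontally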
+ pavgw mm0, [esp]
+ pshufw mm1, mm0, 01001110b
+ paddw mm0, mm1
+ pshufw mm1, mm0, 10110001b
+ paddw mm0, mm1
+ movd eax, mm0
+ and eax, 0xffff
+ mov ecx, eax ; preserve rounding for 16x16
+ add eax, 1
+ shr eax, 1
+ add esp, 0x68
+ pop ebx
+ ret
+%undef args
+%undef spill
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+;; violates calling convention
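+;; (each sa8d_8x8_mmxext call also returns its unrounded sum in ecx,
+;;  accumulated here in ebp)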
+x264_pixel_sa8d_16x16_mmxext:
+ push esi
+ push edi
+ push ebp
+ mov esi, [esp+28] ; stride2
+ mov edi, [esp+20] ; stride1
+ push esi
+ push dword [esp+28] ; pix2
+ push edi
+ push dword [esp+28] ; pix1
+ call x264_pixel_sa8d_8x8_mmxext
+ mov ebp, ecx
+ shl edi, 3
+ shl esi, 3
+ add [esp+0], edi ; pix1+8*stride1
+ add [esp+8], esi ; pix2+8*stride2
+ call x264_pixel_sa8d_8x8_mmxext
+ add ebp, ecx
+ add dword [esp+0], 8 ; pix1+8*stride1+8
+ add dword [esp+8], 8 ; pix2+8*stride2+8
+ call x264_pixel_sa8d_8x8_mmxext
+ add ebp, ecx
+ sub [esp+0], edi ; pix1+8
+ sub [esp+8], esi ; pix2+8
+ call x264_pixel_sa8d_8x8_mmxext
+ lea eax, [ebp+ecx+1]
+ shr eax, 1
+ add esp, 16
+ pop ebp
+ pop edi
+ pop esi
+ ret
int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+
int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
+
#endif
pixf->sad_pde[PIXEL_16x16] = x264_pixel_sad_pde_16x16_mmxext;
pixf->sad_pde[PIXEL_16x8 ] = x264_pixel_sad_pde_16x8_mmxext;
pixf->sad_pde[PIXEL_8x16 ] = x264_pixel_sad_pde_8x16_mmxext;
+
+#ifdef ARCH_X86
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
+#endif
}
#endif
pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_sse2;
pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_sse2;
-#ifndef ARCH_X86_64
+#ifdef ARCH_X86
pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2;
pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2;
{
pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_sse2;
pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_sse2;
+
+#ifdef ARCH_X86_64
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
+#endif
}
#endif
TEST_PIXEL( sad );
TEST_PIXEL( ssd );
TEST_PIXEL( satd );
+ TEST_PIXEL( sa8d );
#define TEST_PIXEL_X( N ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \