lea ecx, [ecx+2*edx]
%endmacro
+%macro SSD_INC_1x16P 0
+; Accumulate into mm0 the sum of squared differences for one 16-pixel row.
+; Expects: eax = pix1, ecx = pix2, ebx = stride1, edx = stride2, mm7 = 0.
+; Advances eax/ecx to the next row.
+ movq mm1, [eax] ; pix1[0..7]
+ movq mm2, [ecx] ; pix2[0..7]
+ movq mm3, [eax+8] ; pix1[8..15]
+ movq mm4, [ecx+8] ; pix2[8..15]
+
+ movq mm5, mm2 ; save pix2 low half
+ movq mm6, mm4 ; save pix2 high half
+ psubusb mm2, mm1 ; max(pix2-pix1, 0), unsigned saturation
+ psubusb mm4, mm3
+ psubusb mm1, mm5 ; max(pix1-pix2, 0)
+ psubusb mm3, mm6
+ por mm1, mm2 ; |pix1-pix2| (one operand is zero)
+ por mm3, mm4
+
+ movq mm2, mm1
+ movq mm4, mm3
+ punpcklbw mm1, mm7 ; widen abs diffs from 8 to 16 bits (mm7 = 0)
+ punpcklbw mm3, mm7
+ punpckhbw mm2, mm7
+ punpckhbw mm4, mm7
+ pmaddwd mm1, mm1 ; square and pairwise-add -> two 32-bit sums each
+ pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+ pmaddwd mm4, mm4
+
+ add eax, ebx ; pix1 += stride1
+ add ecx, edx ; pix2 += stride2
+ paddd mm0, mm1 ; fold partial sums into the accumulator
+ paddd mm0, mm2
+ paddd mm0, mm3
+ paddd mm0, mm4
+%endmacro
+
+%macro SSD_INC_1x8P 0
+; Accumulate into mm0 the sum of squared differences for one 8-pixel row.
+; Expects: eax = pix1, ecx = pix2, ebx = stride1, edx = stride2, mm7 = 0.
+ movq mm1, [eax] ; pix1[0..7]
+ movq mm2, [ecx] ; pix2[0..7]
+
+ movq mm5, mm2 ; save pix2
+ psubusb mm2, mm1 ; max(pix2-pix1, 0)
+ psubusb mm1, mm5 ; max(pix1-pix2, 0)
+ por mm1, mm2 ; mm1 = 8bit abs diff
+
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
+ pmaddwd mm1, mm1 ; square and pairwise-add
+ pmaddwd mm2, mm2
+
+ add eax, ebx ; advance both rows
+ add ecx, edx
+ paddd mm0, mm1 ; accumulate
+ paddd mm0, mm2
+%endmacro
+
+%macro SSD_INC_1x4P 0
+; Accumulate into mm0 the sum of squared differences for one 4-pixel row.
+; movd zeroes the upper half of the mmx reg, so only 4 pixels are live.
+ movd mm1, [eax] ; pix1[0..3]
+ movd mm2, [ecx] ; pix2[0..3]
+
+ movq mm5, mm2 ; save pix2
+ psubusb mm2, mm1 ; max(pix2-pix1, 0)
+ psubusb mm1, mm5 ; max(pix1-pix2, 0)
+ por mm1, mm2 ; 8-bit abs diff
+ punpcklbw mm1, mm7 ; widen to 16 bits (mm7 = 0)
+ pmaddwd mm1, mm1 ; square and pairwise-add
+
+ add eax, ebx ; advance both rows
+ add ecx, edx
+ paddd mm0, mm1 ; accumulate
+%endmacro
+
+%macro SSD_INC_8x16P 0
+; Accumulate 8 consecutive 16-pixel rows (see SSD_INC_1x16P).
+; %rep avoids the copy-paste repetition of the hand-unrolled form.
+%rep 8
+ SSD_INC_1x16P
+%endrep
+%endmacro
+
+%macro SSD_INC_4x8P 0
+; Accumulate 4 consecutive 8-pixel rows (see SSD_INC_1x8P).
+%rep 4
+ SSD_INC_1x8P
+%endrep
+%endmacro
+
+%macro SSD_INC_4x4P 0
+; Accumulate 4 consecutive 4-pixel rows (see SSD_INC_1x4P).
+%rep 4
+ SSD_INC_1x4P
+%endrep
+%endmacro
+
%macro LOAD_DIFF_4P 5 ; MMP, MMT, MMZ, [pix1], [pix2]
movd %1, %4
punpcklbw %1, %3
cglobal x264_pixel_sad_4x8_mmxext
cglobal x264_pixel_sad_4x4_mmxext
+cglobal x264_pixel_ssd_16x16_mmxext
+cglobal x264_pixel_ssd_16x8_mmxext
+cglobal x264_pixel_ssd_8x16_mmxext
+cglobal x264_pixel_ssd_8x8_mmxext
+cglobal x264_pixel_ssd_8x4_mmxext
+cglobal x264_pixel_ssd_4x8_mmxext
+cglobal x264_pixel_ssd_4x4_mmxext
+
cglobal x264_pixel_satd_4x4_mmxext
cglobal x264_pixel_satd_4x8_mmxext
cglobal x264_pixel_satd_8x4_mmxext
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext
-ALIGN 16
-;-----------------------------------------------------------------------------
-; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
-;-----------------------------------------------------------------------------
-x264_pixel_sad_16x16_mmxext:
+%macro SAD_START 0
+; Common prologue for the pixel_sad functions: load the four cdecl
+; arguments and clear the mm0 accumulator.  All four loads are required:
+; the SAD_INC_* macros use ebx/edx as strides and eax/ecx as row pointers.
 push ebx
 mov eax, [esp+ 8] ; pix1
 mov ebx, [esp+12] ; stride1
 mov ecx, [esp+16] ; pix2
 mov edx, [esp+20] ; stride2
 pxor mm0, mm0
+%endmacro
+%macro SAD_END 0
+; Common epilogue for the pixel_sad functions: return accumulated sum in eax.
+ movd eax, mm0
+ pop ebx
+ ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_sad_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_sad_16x16_mmxext:
+ SAD_START ; shared prologue
+; eight 2-row iterations = 16 rows
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
-
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
+ SAD_END ; shared epilogue: eax = sum
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_16x8_mmxext:
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-
+ SAD_START ; replaces the hand-written prologue above
+; four 2-row iterations = 8 rows
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
 SAD_INC_2x16P
-
- movd eax, mm0
-
- pop ebx
- ret
-
+ SAD_END ; replaces the hand-written epilogue above
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_8x16_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_8x16_mmxext:
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-
+ SAD_START ; replaces the hand-written prologue above
+; eight 2-row iterations = 16 rows
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
-
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
-
- movd eax, mm0
-
- pop ebx
- ret
+ SAD_END ; replaces the hand-written epilogue above
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_8x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_8x8_mmxext:
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-
+ SAD_START ; replaces the hand-written prologue above
+; four 2-row iterations = 8 rows
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
 SAD_INC_2x8P
-
- movd eax, mm0
-
- pop ebx
- ret
+ SAD_END ; replaces the hand-written epilogue above
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_8x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_8x4_mmxext:
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-
+ SAD_START ; replaces the hand-written prologue above
+; two 2-row iterations = 4 rows
 SAD_INC_2x8P
 SAD_INC_2x8P
-
- movd eax, mm0
-
- pop ebx
- ret
-
+ SAD_END ; replaces the hand-written epilogue above
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_4x8_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_4x8_mmxext:
- push ebx
-
- mov eax, [esp+ 8] ; pix1
- mov ebx, [esp+12] ; stride1
- mov ecx, [esp+16] ; pix2
- mov edx, [esp+20] ; stride2
-
- pxor mm0, mm0
-
+ SAD_START ; replaces the hand-written prologue above
+; four 2-row iterations = 8 rows
 SAD_INC_2x4P
 SAD_INC_2x4P
-
 SAD_INC_2x4P
 SAD_INC_2x4P
-
- movd eax, mm0
-
- pop ebx
- ret
+ SAD_END ; replaces the hand-written epilogue above
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_4x4_mmxext (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
x264_pixel_sad_4x4_mmxext:
+ SAD_START ; shared prologue
+ SAD_INC_2x4P ; rows 0-1
+ SAD_INC_2x4P ; rows 2-3
+ SAD_END ; shared epilogue: eax = sum
+
+
+
+%macro SSD_START 0
+; Common prologue for the pixel_ssd functions: load the four cdecl
+; arguments, zero mm7 (used by the SSD_INC_* macros when unpacking
+; bytes to words) and zero the mm0 accumulator.  The stride1 load is
+; required: every SSD_INC_* macro advances pix1 with "add eax, ebx".
 push ebx
 mov eax, [esp+ 8] ; pix1
 mov ebx, [esp+12] ; stride1
 mov ecx, [esp+16] ; pix2
 mov edx, [esp+20] ; stride2
- pxor mm0, mm0
+ pxor mm7, mm7 ; zero
+ pxor mm0, mm0 ; mm0 holds the sum
+%endmacro
- SAD_INC_2x4P
- SAD_INC_2x4P
-
- movd eax, mm0
+%macro SSD_END 0
+; Common epilogue for the pixel_ssd functions: mm0 holds two 32-bit
+; partial sums; fold them together and return the total in eax.
+ movq mm1, mm0
+ psrlq mm1, 32 ; bring the high dword down
+ paddd mm0, mm1 ; low dword = total
+ movd eax, mm0
 pop ebx
 ret
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_16x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_16x16_mmxext:
+ SSD_START
+ SSD_INC_8x16P ; rows 0-7
+ SSD_INC_8x16P ; rows 8-15
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_16x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_16x8_mmxext:
+ SSD_START
+ SSD_INC_8x16P ; 8 rows of 16 pixels
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_8x16_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_8x16_mmxext:
+ SSD_START
+ SSD_INC_4x8P ; rows 0-3
+ SSD_INC_4x8P ; rows 4-7
+ SSD_INC_4x8P ; rows 8-11
+ SSD_INC_4x8P ; rows 12-15
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_8x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_8x8_mmxext:
+ SSD_START
+ SSD_INC_4x8P ; rows 0-3
+ SSD_INC_4x8P ; rows 4-7
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_8x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_8x4_mmxext:
+ SSD_START
+ SSD_INC_4x8P ; 4 rows of 8 pixels
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_4x8_mmxext:
+ SSD_START
+ SSD_INC_4x4P ; rows 0-3
+ SSD_INC_4x4P ; rows 4-7
+ SSD_END
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; int __cdecl x264_pixel_ssd_4x4_mmxext (uint8_t *, int, uint8_t *, int )
+;-----------------------------------------------------------------------------
+x264_pixel_ssd_4x4_mmxext:
+ SSD_START
+ SSD_INC_4x4P ; 4 rows of 4 pixels
+ SSD_END
+
+
ALIGN 16
;-----------------------------------------------------------------------------
pxor mm7, mm7
-
LOAD_DIFF_4P mm0, mm6, mm7, [eax], [ecx]
LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx], [ecx+edx]
LOAD_DIFF_4P mm2, mm6, mm7, [eax+2*ebx], [ecx+2*edx]
pop ebx
ret
-
-
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_satd_4x8_mmxext (uint8_t *, int, uint8_t *, int )
int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_8x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
+int x264_pixel_ssd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
+
int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
PIXEL_SAD_C( pixel_sad_4x8, 4, 8 )
PIXEL_SAD_C( pixel_sad_4x4, 4, 4 )
+
+/****************************************************************************
+ * pixel_ssd_WxH: portable C sum of squared differences between two
+ * lx x ly pixel blocks.  pix1/pix2 point to the top-left pixel of each
+ * block; i_stride_pix1/i_stride_pix2 are the distances in bytes between
+ * successive rows.  Returns the sum over the block of (pix1-pix2)^2.
+ ****************************************************************************/
+#define PIXEL_SSD_C( name, lx, ly ) \
+static int name( uint8_t *pix1, int i_stride_pix1, \
+ uint8_t *pix2, int i_stride_pix2 ) \
+{ \
+ int i_sum = 0; \
+ int x, y; \
+ for( y = 0; y < ly; y++ ) \
+ { \
+ for( x = 0; x < lx; x++ ) \
+ { \
+ int d = pix1[x] - pix2[x]; \
+ i_sum += d*d; \
+ } \
+ pix1 += i_stride_pix1; \
+ pix2 += i_stride_pix2; \
+ } \
+ return i_sum; \
+}
+
+/* Instantiate one function per partition size used by the encoder. */
+PIXEL_SSD_C( pixel_ssd_16x16, 16, 16 )
+PIXEL_SSD_C( pixel_ssd_16x8, 16, 8 )
+PIXEL_SSD_C( pixel_ssd_8x16, 8, 16 )
+PIXEL_SSD_C( pixel_ssd_8x8, 8, 8 )
+PIXEL_SSD_C( pixel_ssd_8x4, 8, 4 )
+PIXEL_SSD_C( pixel_ssd_4x8, 4, 8 )
+PIXEL_SSD_C( pixel_ssd_4x4, 4, 4 )
+
+
static void pixel_sub_4x4( int16_t diff[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
int y, x;
pixf->sad[PIXEL_4x8] = pixel_sad_4x8;
pixf->sad[PIXEL_4x4] = pixel_sad_4x4;
+ pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16;
+ pixf->ssd[PIXEL_16x8] = pixel_ssd_16x8;
+ pixf->ssd[PIXEL_8x16] = pixel_ssd_8x16;
+ pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8;
+ pixf->ssd[PIXEL_8x4] = pixel_ssd_8x4;
+ pixf->ssd[PIXEL_4x8] = pixel_ssd_4x8;
+ pixf->ssd[PIXEL_4x4] = pixel_ssd_4x4;
+
pixf->satd[PIXEL_16x16]= pixel_satd_16x16;
pixf->satd[PIXEL_16x8] = pixel_satd_16x8;
pixf->satd[PIXEL_8x16] = pixel_satd_8x16;
pixf->sad[PIXEL_4x8 ] = x264_pixel_sad_4x8_mmxext;
pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_mmxext;
+ pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_mmxext;
+ pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_mmxext;
+ pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_mmxext;
+ pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_mmxext;
+ pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_mmxext;
+ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_mmxext;
+ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_mmxext;
+
pixf->satd[PIXEL_16x16]= x264_pixel_satd_16x16_mmxext;
pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_mmxext;
pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_mmxext;