pixf->vsad = x264_pixel_vsad_xop;
pixf->asd8 = x264_pixel_asd8_xop;
}
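+ /* AVX2 SAD kernels exist only for the 16-pixel-wide sizes (a 16-pixel
+  * high-bit-depth row is exactly one 32-byte ymm register), so INIT2 fills
+  * just the 16x16/16x8 entries; INIT2_NAME points sad_aligned at the same
+  * _avx2 functions. */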
+ if( cpu&X264_CPU_AVX2 )
+ {
+ INIT2( sad, _avx2 );
+ INIT2_NAME( sad_aligned, sad, _avx2 );
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
+DECL_X1( sad, avx2 )
+DECL_X1( sad, avx2_aligned )
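+/* The DECL_X1 lines declare prototypes for the new single-block AVX2 SAD
+ * functions (plain and _aligned); no AVX2 sad_x3/sad_x4 multi-reference
+ * variants are added below. */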
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
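+;
+; Scalar sketch of what every kernel below computes (hypothetical C, not part
+; of this patch): the sum of absolute differences over an NxM block of 16-bit
+; pixels, with the strides given in pixels rather than bytes:
+;
+;   static int sad_NxM( uint16_t *pix1, intptr_t stride1,
+;                       uint16_t *pix2, intptr_t stride2 )
+;   {
+;       int sum = 0;
+;       for( int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2 )
+;           for( int x = 0; x < N; x++ )
+;               sum += abs( pix1[x] - pix2[x] );
+;       return sum;
+;   }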
%macro SAD_MMX 3
-cglobal pixel_sad_%1x%2, 4,4
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
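+; Height-4 blocks are fully unrolled below (two SAD_INC calls), so the
+; 5-(%2&4/4) term is intended to skip allocating r4, which the taller sizes
+; use as their loop counter.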
pxor m0, m0
-%rep %2/%3
+%if %2 == 4
SAD_INC_%3x%1P_MMX
-%endrep
+ SAD_INC_%3x%1P_MMX
+%else
+ mov r4d, %2/%3
+.loop:
+ SAD_INC_%3x%1P_MMX
+ dec r4d
+ jg .loop
+%endif
%if %1*%2 == 256
HADDUW m0, m1
%else
; SAD XMM
;=============================================================================
-%macro SAD_INC_2x16P_XMM 0
+%macro SAD_INC_2ROW 1
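+; Accumulate |pix1-pix2| over two rows of a %1-pixel-wide block into the
+; packed word sums in m0, then step ahead two rows.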
+%if 2*%1 > mmsize
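+; A row of %1 16-bit pixels (2*%1 bytes) doesn't fit in one vector register
+; (the 16-wide xmm case), so each row is loaded and differenced in two halves.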
movu m1, [r2+ 0]
movu m2, [r2+16]
movu m3, [r2+2*r3+ 0]
paddw m3, m4
paddw m0, m1
paddw m0, m3
-%endmacro
-
-%macro SAD_INC_2x8P_XMM 0
+%else
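+; A whole row fits in a single register: 8-wide xmm, or 16-wide ymm with AVX2.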
movu m1, [r2]
movu m2, [r2+2*r3]
psubw m1, [r0]
lea r2, [r2+4*r3]
paddw m0, m1
paddw m0, m2
+%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
-%macro SAD_XMM 2
-cglobal pixel_sad_%1x%2, 4,4,8
+%macro SAD 2
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
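+; As in SAD_MMX, the height-4 sizes are unrolled and meant to skip the r4d
+; loop counter. 8*(%1/mmsize) declares all 8 vector registers only for the
+; split-row path (16-wide xmm), presumably because only that path needs
+; temporaries beyond xmm5, which x86inc must preserve on WIN64.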
pxor m0, m0
-%rep %2/2
- SAD_INC_2x%1P_XMM
-%endrep
+%if %2 == 4
+ SAD_INC_2ROW %1
+ SAD_INC_2ROW %1
+%else
+ mov r4d, %2/2
+.loop:
+ SAD_INC_2ROW %1
+ dec r4d
+ jg .loop
+%endif
HADDW m0, m1
- movd eax, m0
+ movd eax, xm0
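+; movd takes an xmm register, so use xm0, the low 128-bit view of m0; for the
+; xmm instantiations xm0 and m0 are the same register.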
RET
%endmacro
INIT_XMM sse2
-SAD_XMM 16, 16
-SAD_XMM 16, 8
-SAD_XMM 8, 16
-SAD_XMM 8, 8
-SAD_XMM 8, 4
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+SAD 8, 4
INIT_XMM sse2, aligned
-SAD_XMM 16, 16
-SAD_XMM 16, 8
-SAD_XMM 8, 16
-SAD_XMM 8, 8
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
INIT_XMM ssse3
-SAD_XMM 16, 16
-SAD_XMM 16, 8
-SAD_XMM 8, 16
-SAD_XMM 8, 8
-SAD_XMM 8, 4
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
+SAD 8, 4
INIT_XMM ssse3, aligned
-SAD_XMM 16, 16
-SAD_XMM 16, 8
-SAD_XMM 8, 16
-SAD_XMM 8, 8
+SAD 16, 16
+SAD 16, 8
+SAD 8, 16
+SAD 8, 8
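+; With 16-bit pixels a 16-wide row is exactly one ymm register, so only the
+; 16x16 and 16x8 sizes get AVX2 versions (both plain and aligned).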
+INIT_YMM avx2
+SAD 16, 16
+SAD 16, 8
+INIT_YMM avx2, aligned
+SAD 16, 16
+SAD 16, 8
;=============================================================================
; SAD x3/x4