INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT8( ssd, _mmx2 );
- INIT_ADS( _mmx2 );
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
INIT2_NAME( sad_aligned, sad, _avx2 );
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
+#if ARCH_X86 || !defined( __MACH__ )
+ INIT_ADS( _avx2 );
+#endif
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
shl r2d, 1
%endmacro
-%macro ADS_END 1 ; unroll_size
- add r1, 8*%1
- add r3, 8*%1
- add r6, 4*%1
- sub r0d, 4*%1
- jg .loop
- WIN64_RESTORE_XMM
+%macro ADS_END 1-2 .loop ; unroll_size (in elements), loop_label
+ add r1, 2*%1
+ add r3, 2*%1
+ add r6, %1
+ sub r0d, %1
+ jg %2
+ WIN64_RESTORE_XMM_INTERNAL
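+; ADS_END can now be expanded twice in one function (.loop and .loop_nz), so
+; use the _INTERNAL variant: plain WIN64_RESTORE_XMM also resets the prologue's
+; stack bookkeeping, which would leave the second expansion with nothing to
+; restore.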
%if mmsize==32
vzeroupper
%endif
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
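+;
+; For each candidate position: sum the absolute differences between the
+; precomputed row sums and the encoded block's DC values, add the mv cost,
+; and store a nonzero mask byte iff the total is below thresh; the jump to
+; ads_mvs at the end of ADS_END then compacts the mask into the mvs list.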
-INIT_MMX mmx2
-cglobal pixel_ads4, 5,7
- mova m6, [r0]
- mova m4, [r0+8]
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
- pshufw m5, m4, 0
- pshufw m4, m4, q2222
+%if HIGH_BIT_DEPTH
+
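+; High bit depth: the full 32-bit dc values are used and |sums - dc| can
+; exceed 16 bits, so widen everything to dwords and accumulate and compare in
+; dwords; packssdw+packuswb saturation then turns thresh-ads into the
+; nonzero-iff-below-thresh byte mask.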
+%macro ADS_XMM 0
+%if ARCH_X86_64
+cglobal pixel_ads4, 5,7,9
+%else
+cglobal pixel_ads4, 5,7,8
+%endif
+%if mmsize >= 32
+ vpbroadcastd m7, [r0+ 0]
+ vpbroadcastd m6, [r0+ 4]
+ vpbroadcastd m5, [r0+ 8]
+ vpbroadcastd m4, [r0+12]
+%else
+ mova m4, [r0]
+ pshufd m7, m4, 0
+ pshufd m6, m4, q1111
+ pshufd m5, m4, q2222
+ pshufd m4, m4, q3333
+%endif
+%if ARCH_X86_64
+ SPLATD m8, r6m
+%endif
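+; m7-m4 = splatted dc[0..3]; x86_64 keeps thresh splatted in m8, while x86_32
+; has no spare register and re-splats it from the stack every iteration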
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+16]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- movu m2, [r1+r2]
- movu m3, [r1+r2+16]
- psubw m2, m5
- psubw m3, m4
- paddw m0, m1
- ABSW m2, m2, m1
- ABSW m3, m3, m1
- paddw m0, m2
- paddw m0, m3
- pshufw m1, r6m, 0
- paddusw m0, [r3]
- psubusw m1, m0
- packsswb m1, m1
+%if cpuflag(avx)
+ pmovzxwd m0, [r1]
+ pmovzxwd m1, [r1+16]
+%else
+ movh m0, [r1]
+ movh m1, [r1+16]
+ pxor m3, m3
+ punpcklwd m0, m3
+ punpcklwd m1, m3
+%endif
+ psubd m0, m7
+ psubd m1, m6
+ ABSD m0, m0, m2
+ ABSD m1, m1, m3
+%if cpuflag(avx)
+ pmovzxwd m2, [r1+r2]
+ pmovzxwd m3, [r1+r2+16]
+ paddd m0, m1
+%else
+ movh m2, [r1+r2]
+ movh m3, [r1+r2+16]
+ paddd m0, m1
+ pxor m1, m1
+ punpcklwd m2, m1
+ punpcklwd m3, m1
+%endif
+ psubd m2, m5
+ psubd m3, m4
+ ABSD m2, m2, m1
+ ABSD m3, m3, m1
+ paddd m0, m2
+ paddd m0, m3
+%if cpuflag(avx)
+ pmovzxwd m1, [r3]
+%else
+ movh m1, [r3]
+ pxor m3, m3
+ punpcklwd m1, m3
+%endif
+ paddd m0, m1
+%if ARCH_X86_64
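+ ; x86inc emulates this 3-operand form with mova+psubd on pre-AVX targets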
+ psubd m1, m8, m0
+%else
+ SPLATD m1, r6m
+ psubd m1, m0
+%endif
+ packssdw m1, m1
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ packuswb m1, m1
+ movq [r6], xm1
+%else
+ packuswb m1, m1
movd [r6], m1
- ADS_END 1
+%endif
+ ADS_END mmsize/4
-cglobal pixel_ads2, 5,7
- mova m6, [r0]
- pshufw m5, r6m, 0
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
+cglobal pixel_ads2, 5,7,8
+%if mmsize >= 32
+ vpbroadcastd m7, [r0+0]
+ vpbroadcastd m6, [r0+4]
+ vpbroadcastd m5, r6m
+%else
+ movq m6, [r0]
+ movd m5, r6m
+ pshufd m7, m6, 0
+ pshufd m6, m6, q1111
+ pshufd m5, m5, 0
+%endif
+ pxor m4, m4
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddw m0, m1
- paddusw m0, [r3]
- mova m4, m5
- psubusw m4, m0
- packsswb m4, m4
- movd [r6], m4
- ADS_END 1
-
-cglobal pixel_ads1, 5,7
- pshufw m7, [r0], 0
- pshufw m6, r6m, 0
+%if cpuflag(avx)
+ pmovzxwd m0, [r1]
+ pmovzxwd m1, [r1+r2]
+ pmovzxwd m2, [r3]
+%else
+ movh m0, [r1]
+ movh m1, [r1+r2]
+ movh m2, [r3]
+ punpcklwd m0, m4
+ punpcklwd m1, m4
+ punpcklwd m2, m4
+%endif
+ psubd m0, m7
+ psubd m1, m6
+ ABSD m0, m0, m3
+ ABSD m1, m1, m3
+ paddd m0, m1
+ paddd m0, m2
+ psubd m1, m5, m0
+ packssdw m1, m1
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ packuswb m1, m1
+ movq [r6], xm1
+%else
+ packuswb m1, m1
+ movd [r6], m1
+%endif
+ ADS_END mmsize/4
+
+cglobal pixel_ads1, 5,7,8
+%if mmsize >= 32
+ vpbroadcastd m7, [r0]
+ vpbroadcastd m6, r6m
+%else
+ movd m7, [r0]
+ movd m6, r6m
+ pshufd m7, m7, 0
+ pshufd m6, m6, 0
+%endif
+ pxor m5, m5
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+8]
- psubw m0, m7
- psubw m1, m7
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddusw m0, [r3]
- paddusw m1, [r3+8]
- mova m4, m6
- mova m5, m6
- psubusw m4, m0
- psubusw m5, m1
- packsswb m4, m5
- mova [r6], m4
- ADS_END 2
+ movu m1, [r1]
+ movu m3, [r3]
+ punpcklwd m0, m1, m5
+ punpckhwd m1, m5
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ psubd m0, m7
+ psubd m1, m7
+ ABSD m0, m0, m4
+ ABSD m1, m1, m4
+ paddd m0, m2
+ paddd m1, m3
+ psubd m2, m6, m0
+ psubd m3, m6, m1
+ packssdw m2, m3
+ packuswb m2, m2
+%if mmsize == 32
+ vpermq m2, m2, q3120
+ mova [r6], xm2
+%else
+ movq [r6], m2
+%endif
+ ADS_END mmsize/2
+%endmacro
+
+INIT_XMM sse2
+ADS_XMM
+INIT_XMM ssse3
+ADS_XMM
+INIT_XMM avx
+ADS_XMM
+INIT_YMM avx2
+ADS_XMM
+
+%else ; !HIGH_BIT_DEPTH
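+
+; 8-bit: the 16-bit saturating loops below remain the fast path; thresholds
+; wider than 16 bits branch to .nz variants that widen the final sums to
+; dwords before the comparison.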
%macro ADS_XMM 0
-%if mmsize==32
+%if ARCH_X86_64 && mmsize == 16
+cglobal pixel_ads4, 5,7,12
+%elif ARCH_X86_64 && mmsize != 8
+cglobal pixel_ads4, 5,7,9
+%else
cglobal pixel_ads4, 5,7,8
+%endif
+ test dword r6m, 0xffff0000
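+; a thresh that doesn't fit in 16 bits can't go through the psubusw path, so
+; take the dword .nz path instead; none of the splats below modify EFLAGS,
+; which lets the jnz come after them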
+%if mmsize >= 32
vpbroadcastw m7, [r0+ 0]
vpbroadcastw m6, [r0+ 4]
vpbroadcastw m5, [r0+ 8]
vpbroadcastw m4, [r0+12]
-%else
-cglobal pixel_ads4, 5,7,12
- mova m4, [r0]
- pshuflw m7, m4, q0000
- pshuflw m6, m4, q2222
- pshufhw m5, m4, q0000
- pshufhw m4, m4, q2222
+%elif mmsize == 16
+ mova m4, [r0]
+ pshuflw m7, m4, 0
+ pshuflw m6, m4, q2222
+ pshufhw m5, m4, 0
+ pshufhw m4, m4, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
punpckhqdq m5, m5
punpckhqdq m4, m4
+%else
+ mova m6, [r0]
+ mova m4, [r0+8]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
+ pshufw m5, m4, 0
+ pshufw m4, m4, q2222
%endif
-%if ARCH_X86_64 && mmsize == 16
- movd m8, r6m
- SPLATW m8, m8
+ jnz .nz
ADS_START
+%if ARCH_X86_64 && mmsize == 16
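+ ; software-pipelined: m10/m11 hold the current rows and are reloaded one
+ ; iteration ahead to hide load latency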
movu m10, [r1]
movu m11, [r1+r2]
+ SPLATW m8, r6m
.loop:
psubw m0, m10, m7
movu m10, [r1+16]
paddusw m0, m9
psubusw m1, m8, m0
%else
- ADS_START
+%if ARCH_X86_64 && mmsize != 8
+ SPLATW m8, r6m
+%endif
.loop:
movu m0, [r1]
movu m1, [r1+16]
paddw m0, m2
paddw m0, m3
movu m2, [r3]
-%if mmsize==32
- vpbroadcastw m1, r6m
+%if ARCH_X86_64 && mmsize != 8
+ mova m1, m8
%else
- movd m1, r6m
- pshuflw m1, m1, 0
- punpcklqdq m1, m1
+ SPLATW m1, r6m
%endif
paddusw m0, m2
psubusw m1, m0
%endif ; ARCH
packsswb m1, m1
-%if mmsize==32
+%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
- ADS_END mmsize/8
+ ADS_END mmsize/2
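+
+; thresh > 0xffff: the SAD part still fits in words (at 8-bit the four
+; absolute differences sum to at most 4*64*255 < 0x10000), so only the cost
+; addition and the threshold comparison are widened to dwords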
+.nz:
+ ADS_START
+%if ARCH_X86_64 && mmsize == 16
+ movu m10, [r1]
+ movu m11, [r1+r2]
+ SPLATD m8, r6m
+.loop_nz:
+ psubw m0, m10, m7
+ movu m10, [r1+16]
+ psubw m1, m10, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ psubw m2, m11, m5
+ movu m11, [r1+r2+16]
+ paddw m0, m1
+ psubw m3, m11, m4
+ movu m9, [r3]
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ pxor m3, m3
+ mova m2, m0
+ mova m1, m9
+ punpcklwd m0, m3
+ punpcklwd m9, m3
+ punpckhwd m2, m3
+ punpckhwd m1, m3
+ paddd m0, m9
+ paddd m2, m1
+ psubd m1, m8, m0
+ psubd m3, m8, m2
+ packssdw m1, m3
+ packuswb m1, m1
+%else
+%if ARCH_X86_64 && mmsize != 8
+ SPLATD m8, r6m
+%endif
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+%if mmsize == 32
+ movu m1, [r3]
+%else
+ movh m1, [r3]
+%endif
+ pxor m3, m3
+ mova m2, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m3
+ punpckhwd m2, m3
+ paddd m0, m1
+%if mmsize == 32
+ movu m1, [r3]
+ punpckhwd m1, m3
+%else
+ movh m1, [r3+mmsize/2]
+ punpcklwd m1, m3
+%endif
+ paddd m2, m1
+%if ARCH_X86_64 && mmsize != 8
+ mova m1, m8
+%else
+ SPLATD m1, r6m
+%endif
+ mova m3, m1
+ psubd m1, m0
+ psubd m3, m2
+ packssdw m1, m3
+ packuswb m1, m1
+%endif ; ARCH
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/2, .loop_nz
cglobal pixel_ads2, 5,7,8
-%if mmsize==32
+ test dword r6m, 0xffff0000
+%if mmsize >= 32
vpbroadcastw m7, [r0+0]
vpbroadcastw m6, [r0+4]
- vpbroadcastw m5, r6m
-%else
- movq m6, [r0]
- movd m5, r6m
- pshuflw m7, m6, 0
- pshuflw m6, m6, q2222
- pshuflw m5, m5, 0
+%elif mmsize == 16
+ movq m6, [r0]
+ pshuflw m7, m6, 0
+ pshuflw m6, m6, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
- punpcklqdq m5, m5
+%else
+ mova m6, [r0]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
%endif
+ jnz .nz
ADS_START
+ SPLATW m5, r6m
.loop:
movu m0, [r1]
movu m1, [r1+r2]
+ movu m2, [r3]
psubw m0, m7
psubw m1, m6
- movu m4, [r3]
- ABSW m0, m0, m2
- ABSW m1, m1, m3
+ ABSW m0, m0, m3
+ ABSW m1, m1, m4
paddw m0, m1
- paddusw m0, m4
+ paddusw m0, m2
psubusw m1, m5, m0
packsswb m1, m1
-%if mmsize==32
+%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
- ADS_END mmsize/8
-
-cglobal pixel_ads1, 5,7,8
-%if mmsize==32
- vpbroadcastw m7, [r0]
- vpbroadcastw m6, r6m
+ ADS_END mmsize/2
+.nz:
+ ADS_START
+ SPLATD m5, r6m
+ pxor m4, m4
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ movu m2, [r3]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m3
+ ABSW m1, m1, m3
+ paddw m0, m1
+ punpckhwd m3, m2, m4
+ punpckhwd m1, m0, m4
+ punpcklwd m2, m4
+ punpcklwd m0, m4
+ paddd m1, m3
+ paddd m0, m2
+ psubd m3, m5, m1
+ psubd m2, m5, m0
+ packssdw m2, m3
+ packuswb m2, m2
+%if mmsize == 32
+ vpermq m2, m2, q3120
+ mova [r6], xm2
%else
- movd m7, [r0]
- movd m6, r6m
- pshuflw m7, m7, 0
- pshuflw m6, m6, 0
- punpcklqdq m7, m7
- punpcklqdq m6, m6
+ movh [r6], m2
%endif
+ ADS_END mmsize/2, .loop_nz
+
+cglobal pixel_ads1, 5,7,8
+ test dword r6m, 0xffff0000
+ SPLATW m7, [r0]
+ jnz .nz
ADS_START
+ SPLATW m6, r6m
.loop:
movu m0, [r1]
movu m1, [r1+mmsize]
- psubw m0, m7
- psubw m1, m7
movu m2, [r3]
movu m3, [r3+mmsize]
+ psubw m0, m7
+ psubw m1, m7
ABSW m0, m0, m4
ABSW m1, m1, m5
paddusw m0, m2
psubusw m4, m6, m0
psubusw m5, m6, m1
packsswb m4, m5
-%if mmsize==32
+%if mmsize == 32
vpermq m4, m4, q3120
%endif
mova [r6], m4
- ADS_END mmsize/4
+ ADS_END mmsize
+.nz:
+ ADS_START
+ SPLATD m6, r6m
+ pxor m5, m5
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+mmsize]
+ movu m2, [r3]
+ psubw m0, m7
+ psubw m1, m7
+ ABSW m0, m0, m3
+ ABSW m1, m1, m4
+ punpckhwd m3, m2, m5
+ punpckhwd m4, m0, m5
+ punpcklwd m2, m5
+ punpcklwd m0, m5
+ paddd m4, m3
+ paddd m0, m2
+ psubd m3, m6, m4
+ movu m4, [r3+mmsize]
+ psubd m2, m6, m0
+ packssdw m2, m3
+ punpckhwd m0, m1, m5
+ punpckhwd m3, m4, m5
+ punpcklwd m1, m5
+ punpcklwd m4, m5
+ paddd m0, m3
+ paddd m1, m4
+ psubd m3, m6, m0
+ psubd m4, m6, m1
+ packssdw m4, m3
+ packuswb m2, m4
+%if mmsize == 32
+ vpermq m2, m2, q3120
+%endif
+ mova [r6], m2
+ ADS_END mmsize, .loop_nz
%endmacro
+INIT_MMX mmx2
+ADS_XMM
INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
INIT_YMM avx2
ADS_XMM
+%endif ; HIGH_BIT_DEPTH
+
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
test r2, r2
%else
mov r3, r2
- add r3d, [r6+r1+4]
+ or r3d, [r6+r1+4]
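+ ; or rather than add: an add could wrap two nonzero halves around to zero
+ ; and falsely report an empty mask group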
%endif
jz .loopi0
xor r3d, r3d
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
+ %macro WIN64_RESTORE_XMM_INTERNAL 0
+ %endmacro
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
- PSHUFLW %1, %2, (%3)*q1111
-%if mmsize == 16
+ %define %%s %2
+%ifid %2
+ %define %%s xmm%2
+%elif %3 == 0
+ movd xmm%1, %2
+ %define %%s xmm%1
+%endif
+ PSHUFLW xmm%1, %%s, (%3)*q1111
+%if mmsize >= 32
+ vpbroadcastq %1, xmm%1
+%elif mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
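+; splat a dword: as with SPLATW above, %2 may be a memory operand when
+; broadcasting element 0, and AVX2 broadcasts element 0 with vpbroadcastd
+; directly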
%imacro SPLATD 2-3 0
-%if mmsize == 16
- pshufd %1, %2, (%3)*q1111
+%if cpuflag(avx2) && %3 == 0
+ vpbroadcastd %1, %2
+%else
+ %define %%s %2
+%ifid %2
+ %define %%s xmm%2
+%elif %3 == 0
+ movd xmm%1, %2
+ %define %%s xmm%1
+%endif
+%if mmsize == 8 && %3 == 0
+%ifidn %1, %%s
+ punpckldq %1, %1
%else
- pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
+ pshufw %1, %%s, q1010
+%endif
+%elif mmsize == 8 && %3 == 1
+%ifidn %1, %%s
+ punpckhdq %1, %1
+%else
+ pshufw %1, %%s, q3232
+%endif
+%else
+ pshufd xmm%1, %%s, (%3)*q1111
+%endif
+%if mmsize >= 32
+ vpbroadcastq %1, xmm%1
+%endif
%endif
%endmacro
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
+// RAND_MAX is guaranteed to be at least 32767, so to get 30 bits of random data we call rand() twice
+#define rand30() (((rand() & 0x7fff) << 15) + (rand() & 0x7fff))
+
typedef struct
{
void *pointer; // just for detecting duplicates
ok = 1; used_asm = 0;
for( int i = 0; i < 32; i++ )
- cost_mv[i] = i*10;
+ cost_mv[i] = rand30() & 0xffff;
for( int i = 0; i < 100 && ok; i++ )
if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
ALIGNED_16( int16_t mvs_a[48] );
ALIGNED_16( int16_t mvs_c[48] );
int mvn_a, mvn_c;
- int thresh = rand() & 0x3fff;
- set_func_name( "esa_ads" );
- for( int j = 0; j < 72; j++ )
- sums[j] = rand() & 0x3fff;
- for( int j = 0; j < 4; j++ )
- dc[j] = rand() & 0x3fff;
+ int thresh = (rand() % 257) * PIXEL_MAX + (rand30() & 0xffff);
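+ // thresholds range up to 256*PIXEL_MAX + 0xffff, exercising both the
+ // 16-bit fast path and the 32-bit .nz path of the asm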
+ set_func_name( "esa_ads_%s", pixel_names[i&3] );
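+ // early iterations use coarse, block-like dc/sums values; later ones use
+ // fully random values bounded by the maximum possible sum, 1 << (BIT_DEPTH+6)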
+ if( i < 40 )
+ {
+ for( int j = 0; j < 72; j++ )
+ sums[j] = (rand() % 9) * 8 * PIXEL_MAX;
+ for( int j = 0; j < 4; j++ )
+ dc[j] = (rand() % 9) * 8 * PIXEL_MAX;
+ }
+ else
+ {
+#if BIT_DEPTH + 6 > 15
+ for( int j = 0; j < 72; j++ )
+ sums[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
+ for( int j = 0; j < 4; j++ )
+ dc[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
+#else
+ for( int j = 0; j < 72; j++ )
+ sums[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
+ for( int j = 0; j < 4; j++ )
+ dc[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
+#endif
+ }
used_asm = 1;
mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
{
ok = 0;
+ printf( "thresh: %d\n", thresh );
printf( "c%d: ", i&3 );
for( int j = 0; j < mvn_c; j++ )
printf( "%d ", mvs_c[j] );
x264_emms();
for( int i = 0; i < 10; i++ )
{
- float fps_factor = (rand()&65535) / 65535.0f;
+ float fps_factor = (rand30()&65535) / 65535.0f;
set_func_name( "mbtree_propagate_cost" );
int16_t *dsta = (int16_t*)buf3;
int16_t *dstc = dsta+400;