Automatically handle 3-operand instructions and the abstraction between SSE and AVX.
Implement one function with this (denoise_dct) as an initial test.
x264 can't make much use of AVX's 256-bit support (as 256-bit AVX is float-only), but the 3-operand forms could give some small benefits.
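
As a rough illustration (using x86inc's mN register names from the denoise_dct
code below), a line written as

    psubusw m4, m0, [r2]

assembles to "vpsubusw xmm4, xmm0, [r2]" when INIT_AVX is in effect, while under
plain INIT_XMM the macro layer emulates it as "movdqa xmm4, xmm0" followed by
"psubusw xmm4, [r2]", so one body of source serves both instruction sets.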
{"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
+ {"AVX", X264_CPU_AVX},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
+ if( ecx&0x10000000 )
+ cpu |= X264_CPU_AVX;
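+ /* Note: before executing VEX-encoded (AVX) instructions, a fully robust check
+ * would also test OSXSAVE (ECX bit 27) and use XGETBV to confirm the OS
+ * actually saves the AVX state; VEX instructions #UD if it does not. */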
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
+
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->denoise_dct = x264_denoise_dct_avx;
+ }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
mova m3, [r0+r3*2+1*mmsize]
PABSW m0, m2
PABSW m1, m3
- mova m4, m0
- mova m5, m1
- psubusw m0, [r2+r3*2+0*mmsize]
- psubusw m1, [r2+r3*2+1*mmsize]
- PSIGNW m0, m2
- PSIGNW m1, m3
- mova [r0+r3*2+0*mmsize], m0
- mova [r0+r3*2+1*mmsize], m1
- mova m2, m4
- mova m3, m5
- punpcklwd m4, m6
- punpckhwd m2, m6
- punpcklwd m5, m6
- punpckhwd m3, m6
- paddd m4, [r1+r3*4+0*mmsize]
- paddd m2, [r1+r3*4+1*mmsize]
- paddd m5, [r1+r3*4+2*mmsize]
- paddd m3, [r1+r3*4+3*mmsize]
- mova [r1+r3*4+0*mmsize], m4
- mova [r1+r3*4+1*mmsize], m2
- mova [r1+r3*4+2*mmsize], m5
- mova [r1+r3*4+3*mmsize], m3
+ psubusw m4, m0, [r2+r3*2+0*mmsize]
+ psubusw m5, m1, [r2+r3*2+1*mmsize]
+ PSIGNW m4, m2
+ PSIGNW m5, m3
+ mova [r0+r3*2+0*mmsize], m4
+ mova [r0+r3*2+1*mmsize], m5
+ punpcklwd m2, m0, m6
+ punpcklwd m3, m1, m6
+ punpckhwd m0, m6
+ punpckhwd m1, m6
+ paddd m2, [r1+r3*4+0*mmsize]
+ paddd m0, [r1+r3*4+1*mmsize]
+ paddd m3, [r1+r3*4+2*mmsize]
+ paddd m1, [r1+r3*4+3*mmsize]
+ mova [r1+r3*4+0*mmsize], m2
+ mova [r1+r3*4+1*mmsize], m0
+ mova [r1+r3*4+2*mmsize], m3
+ mova [r1+r3*4+3*mmsize], m1
jg .loop
mov [r0], r4w
RET
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3, 7
+INIT_AVX
+DENOISE_DCT avx, 7
%endif ; !HIGH_BIT_DEPTH
%endmacro
%ifndef ARCH_X86_64
+INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext, 0, 0
DECIMATE4x4 16, mmxext, 0, 0
DECIMATE4x4 15, mmxext_slowctz, 1, 0
DECIMATE4x4 16, mmxext_slowctz, 1, 0
%endif
+INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2, 0, 0
DECIMATE4x4 16, sse2, 0, 0
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-void x264_denoise_dct_mmx( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
-void x264_denoise_dct_sse2( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
+void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmxext( dctcoef *dct );
int x264_decimate_score15_sse2 ( dctcoef *dct );
int x264_decimate_score15_ssse3 ( dctcoef *dct );
%endmacro
%macro INIT_MMX 0
+ %assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX
%define mmsize 8
%define num_mmregs 8
%endmacro
%macro INIT_XMM 0
+ %assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_XMM
%define mmsize 16
%define num_mmregs 8
%endrep
%endmacro
+%macro INIT_AVX 0
+ INIT_XMM
+ %assign avx_enabled 1
+ %define RESET_MM_PERMUTATION INIT_AVX
+%endmacro
+
INIT_MMX
; I often want to use macros that permute their arguments. e.g. there's no
sub %1, %2
%endif
%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%define sizeofmm0 8
+%define sizeofmm1 8
+%define sizeofmm2 8
+%define sizeofmm3 8
+%define sizeofmm4 8
+%define sizeofmm5 8
+%define sizeofmm6 8
+%define sizeofmm7 8
+%define sizeofxmm0 16
+%define sizeofxmm1 16
+%define sizeofxmm2 16
+%define sizeofxmm3 16
+%define sizeofxmm4 16
+%define sizeofxmm5 16
+%define sizeofxmm6 16
+%define sizeofxmm7 16
+%define sizeofxmm8 16
+%define sizeofxmm9 16
+%define sizeofxmm10 16
+%define sizeofxmm11 16
+%define sizeofxmm12 16
+%define sizeofxmm13 16
+%define sizeofxmm14 16
+%define sizeofxmm15 16
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+ %if sizeof%5==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
+
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if avx_enabled && sizeof%5==16
+ v%1 %5, %6, %7
+ %else
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
+ %else
+ %1 %5, %7
+ %endif
+ %elif %3
+ %1 %5, %6, %7
+ %else
+ %1 %5, %6
+ %endif
+%endmacro
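+
+; Illustrative expansion: with avx_enabled set and xmm operands, a call such as
+; "paddw m0, m1, m2" emits "vpaddw m0, m1, m2"; without AVX it falls back to
+; "movdqa m0, m1" followed by "paddw m0, m2", and the extra copy is skipped
+; entirely when the destination and first source are the same register.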
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
+%macro AVX_INSTR 3
+ %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
+ %ifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
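+
+; Each AVX_INSTR line below overloads its base mnemonic: existing two-operand
+; code like "paddw m0, m1" still assembles unchanged (the wrapper simply
+; re-emits it), while the same mnemonic now also accepts the three-operand
+; form (or four-operand, for the immediate variants marked with a 1 in the
+; third argument).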
+
+AVX_INSTR addpd, 1, 0
+AVX_INSTR addps, 1, 0
+AVX_INSTR addsd, 1, 0
+AVX_INSTR addss, 1, 0
+AVX_INSTR addsubpd, 1, 0
+AVX_INSTR addsubps, 1, 0
+AVX_INSTR andpd, 1, 0
+AVX_INSTR andps, 1, 0
+AVX_INSTR andnpd, 1, 0
+AVX_INSTR andnps, 1, 0
+AVX_INSTR blendpd, 1, 0
+AVX_INSTR blendps, 1, 0
+AVX_INSTR blendvpd, 1, 0
+AVX_INSTR blendvps, 1, 0
+AVX_INSTR cmppd, 1, 0
+AVX_INSTR cmpps, 1, 0
+AVX_INSTR cmpsd, 1, 0
+AVX_INSTR cmpss, 1, 0
+AVX_INSTR divpd, 1, 0
+AVX_INSTR divps, 1, 0
+AVX_INSTR divsd, 1, 0
+AVX_INSTR divss, 1, 0
+AVX_INSTR dppd, 1, 0
+AVX_INSTR dpps, 1, 0
+AVX_INSTR haddpd, 1, 0
+AVX_INSTR haddps, 1, 0
+AVX_INSTR hsubpd, 1, 0
+AVX_INSTR hsubps, 1, 0
+AVX_INSTR maxpd, 1, 0
+AVX_INSTR maxps, 1, 0
+AVX_INSTR maxsd, 1, 0
+AVX_INSTR maxss, 1, 0
+AVX_INSTR minpd, 1, 0
+AVX_INSTR minps, 1, 0
+AVX_INSTR minsd, 1, 0
+AVX_INSTR minss, 1, 0
+AVX_INSTR mpsadbw, 0, 1
+AVX_INSTR mulpd, 1, 0
+AVX_INSTR mulps, 1, 0
+AVX_INSTR mulsd, 1, 0
+AVX_INSTR mulss, 1, 0
+AVX_INSTR orpd, 1, 0
+AVX_INSTR orps, 1, 0
+AVX_INSTR packsswb, 0, 0
+AVX_INSTR packssdw, 0, 0
+AVX_INSTR packuswb, 0, 0
+AVX_INSTR packusdw, 0, 0
+AVX_INSTR paddb, 0, 0
+AVX_INSTR paddw, 0, 0
+AVX_INSTR paddd, 0, 0
+AVX_INSTR paddq, 0, 0
+AVX_INSTR paddsb, 0, 0
+AVX_INSTR paddsw, 0, 0
+AVX_INSTR paddusb, 0, 0
+AVX_INSTR paddusw, 0, 0
+AVX_INSTR palignr, 0, 1
+AVX_INSTR pand, 0, 0
+AVX_INSTR pandn, 0, 0
+AVX_INSTR pavgb, 0, 0
+AVX_INSTR pavgw, 0, 0
+AVX_INSTR pblendvb, 0, 0
+AVX_INSTR pblendw, 0, 1
+AVX_INSTR pcmpestri, 0, 0
+AVX_INSTR pcmpestrm, 0, 0
+AVX_INSTR pcmpistri, 0, 0
+AVX_INSTR pcmpistrm, 0, 0
+AVX_INSTR pcmpeqb, 0, 0
+AVX_INSTR pcmpeqw, 0, 0
+AVX_INSTR pcmpeqd, 0, 0
+AVX_INSTR pcmpeqq, 0, 0
+AVX_INSTR pcmpgtb, 0, 0
+AVX_INSTR pcmpgtw, 0, 0
+AVX_INSTR pcmpgtd, 0, 0
+AVX_INSTR pcmpgtq, 0, 0
+AVX_INSTR phaddw, 0, 0
+AVX_INSTR phaddd, 0, 0
+AVX_INSTR phaddsw, 0, 0
+AVX_INSTR phsubw, 0, 0
+AVX_INSTR phsubd, 0, 0
+AVX_INSTR phsubsw, 0, 0
+AVX_INSTR pmaddwd, 0, 0
+AVX_INSTR pmaddubsw, 0, 0
+AVX_INSTR pmaxsb, 0, 0
+AVX_INSTR pmaxsw, 0, 0
+AVX_INSTR pmaxsd, 0, 0
+AVX_INSTR pmaxub, 0, 0
+AVX_INSTR pmaxuw, 0, 0
+AVX_INSTR pmaxud, 0, 0
+AVX_INSTR pminsb, 0, 0
+AVX_INSTR pminsw, 0, 0
+AVX_INSTR pminsd, 0, 0
+AVX_INSTR pminub, 0, 0
+AVX_INSTR pminuw, 0, 0
+AVX_INSTR pminud, 0, 0
+AVX_INSTR pmulhuw, 0, 0
+AVX_INSTR pmulhrsw, 0, 0
+AVX_INSTR pmulhw, 0, 0
+AVX_INSTR pmullw, 0, 0
+AVX_INSTR pmulld, 0, 0
+AVX_INSTR pmuludq, 0, 0
+AVX_INSTR pmuldq, 0, 0
+AVX_INSTR por, 0, 0
+AVX_INSTR psadbw, 0, 0
+AVX_INSTR pshufb, 0, 0
+AVX_INSTR psignb, 0, 0
+AVX_INSTR psignw, 0, 0
+AVX_INSTR psignd, 0, 0
+AVX_INSTR psllw, 0, 0
+AVX_INSTR pslld, 0, 0
+AVX_INSTR psllq, 0, 0
+AVX_INSTR pslldq, 0, 0
+AVX_INSTR psraw, 0, 0
+AVX_INSTR psrad, 0, 0
+AVX_INSTR psrlw, 0, 0
+AVX_INSTR psrld, 0, 0
+AVX_INSTR psrlq, 0, 0
+AVX_INSTR psrldq, 0, 0
+AVX_INSTR psubb, 0, 0
+AVX_INSTR psubw, 0, 0
+AVX_INSTR psubd, 0, 0
+AVX_INSTR psubq, 0, 0
+AVX_INSTR psubsb, 0, 0
+AVX_INSTR psubsw, 0, 0
+AVX_INSTR psubusb, 0, 0
+AVX_INSTR psubusw, 0, 0
+AVX_INSTR punpckhbw, 0, 0
+AVX_INSTR punpckhwd, 0, 0
+AVX_INSTR punpckhdq, 0, 0
+AVX_INSTR punpckhqdq, 0, 0
+AVX_INSTR punpcklbw, 0, 0
+AVX_INSTR punpcklwd, 0, 0
+AVX_INSTR punpckldq, 0, 0
+AVX_INSTR punpcklqdq, 0, 0
+AVX_INSTR pxor, 0, 0
+AVX_INSTR subpd, 1, 0
+AVX_INSTR subps, 1, 0
+AVX_INSTR subsd, 1, 0
+AVX_INSTR subss, 1, 0
+AVX_INSTR unpckhpd, 1, 0
+AVX_INSTR unpckhps, 1, 0
+AVX_INSTR unpcklpd, 1, 0
+AVX_INSTR unpcklps, 1, 0
+AVX_INSTR xorpd, 1, 0
+AVX_INSTR xorps, 1, 0
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_AVX ? "avx" :
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
}
+ if( x264_cpu_detect() & X264_CPU_AVX )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" );
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
#include "x264_config.h"
-#define X264_BUILD 112
+#define X264_BUILD 113
/* x264_t:
* opaque handler for encoder */
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
+#define X264_CPU_AVX 0x400000 /* AVX support -- we currently use only the 3-operand capability,
+ * not the YMM registers, but VEX-encoded instructions still
+ * require the OS to have enabled AVX state. */
/* Analyse flags
*/