From 68cda11b73471d090776cdbe5dbff7f8563fadb5 Mon Sep 17 00:00:00 2001 From: Fiona Glaser Date: Wed, 12 Jan 2011 09:54:33 -0800 Subject: [PATCH] Initial AVX support Automatically handle 3-operand instructions and abstraction between SSE and AVX. Implement one function with this (denoise_dct) as an initial test. x264 can't make much use of the 256-bit support of AVX (as it's float-only), but 3-operand could give some small benefits. --- common/cpu.c | 3 + common/quant.c | 5 + common/x86/quant-a.asm | 44 ++++---- common/x86/quant.h | 5 +- common/x86/x86inc.asm | 231 +++++++++++++++++++++++++++++++++++++++++ tools/checkasm.c | 3 + x264.h | 5 +- 7 files changed, 271 insertions(+), 25 deletions(-) diff --git a/common/cpu.c b/common/cpu.c index 3df4c701..742254cb 100644 --- a/common/cpu.c +++ b/common/cpu.c @@ -59,6 +59,7 @@ const x264_cpu_name_t x264_cpu_names[] = { {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST}, {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42}, + {"AVX", X264_CPU_AVX}, {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, {"SSEMisalign", X264_CPU_SSE_MISALIGN}, @@ -129,6 +130,8 @@ uint32_t x264_cpu_detect( void ) cpu |= X264_CPU_SSE4; if( ecx&0x00100000 ) cpu |= X264_CPU_SSE42; + if( ecx&0x10000000 ) + cpu |= X264_CPU_AVX; if( cpu & X264_CPU_SSSE3 ) cpu |= X264_CPU_SSE2_IS_FAST; diff --git a/common/quant.c b/common/quant.c index f854beb0..20c64353 100644 --- a/common/quant.c +++ b/common/quant.c @@ -474,6 +474,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4 = x264_quant_4x4_sse4; pf->quant_8x8 = x264_quant_8x8_sse4; } + + if( cpu&X264_CPU_AVX ) + { + pf->denoise_dct = x264_denoise_dct_avx; + } #endif // HAVE_MMX #if HAVE_ALTIVEC diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 91b15245..a50cb12c 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -804,28 +804,24 @@ cglobal denoise_dct_%1, 4,5,%2 mova m3, [r0+r3*2+1*mmsize] PABSW m0, m2 PABSW m1, m3 - mova m4, m0 - mova m5, m1 - psubusw m0, [r2+r3*2+0*mmsize] - psubusw m1, [r2+r3*2+1*mmsize] - PSIGNW m0, m2 - PSIGNW m1, m3 - mova [r0+r3*2+0*mmsize], m0 - mova [r0+r3*2+1*mmsize], m1 - mova m2, m4 - mova m3, m5 - punpcklwd m4, m6 - punpckhwd m2, m6 - punpcklwd m5, m6 - punpckhwd m3, m6 - paddd m4, [r1+r3*4+0*mmsize] - paddd m2, [r1+r3*4+1*mmsize] - paddd m5, [r1+r3*4+2*mmsize] - paddd m3, [r1+r3*4+3*mmsize] - mova [r1+r3*4+0*mmsize], m4 - mova [r1+r3*4+1*mmsize], m2 - mova [r1+r3*4+2*mmsize], m5 - mova [r1+r3*4+3*mmsize], m3 + psubusw m4, m0, [r2+r3*2+0*mmsize] + psubusw m5, m1, [r2+r3*2+1*mmsize] + PSIGNW m4, m2 + PSIGNW m5, m3 + mova [r0+r3*2+0*mmsize], m4 + mova [r0+r3*2+1*mmsize], m5 + punpcklwd m2, m0, m6 + punpcklwd m3, m1, m6 + punpckhwd m0, m6 + punpckhwd m1, m6 + paddd m2, [r1+r3*4+0*mmsize] + paddd m0, [r1+r3*4+1*mmsize] + paddd m3, [r1+r3*4+2*mmsize] + paddd m1, [r1+r3*4+3*mmsize] + mova [r1+r3*4+0*mmsize], m2 + mova [r1+r3*4+1*mmsize], m0 + mova [r1+r3*4+2*mmsize], m3 + mova [r1+r3*4+3*mmsize], m1 jg .loop mov [r0], r4w RET @@ -842,6 +838,8 @@ DENOISE_DCT sse2, 7 %define PABSW PABSW_SSSE3 %define PSIGNW PSIGNW_SSSE3 DENOISE_DCT ssse3, 7 +INIT_AVX +DENOISE_DCT avx, 7 %endif ; !HIGH_BIT_DEPTH @@ -970,12 +968,14 @@ cglobal decimate_score%1_%2, 1,3 %endmacro %ifndef ARCH_X86_64 +INIT_MMX 
%define DECIMATE_MASK DECIMATE_MASK_MMX DECIMATE4x4 15, mmxext, 0, 0 DECIMATE4x4 16, mmxext, 0, 0 DECIMATE4x4 15, mmxext_slowctz, 1, 0 DECIMATE4x4 16, mmxext_slowctz, 1, 0 %endif +INIT_XMM %define DECIMATE_MASK DECIMATE_MASK_SSE2 DECIMATE4x4 15, sse2, 0, 0 DECIMATE4x4 16, sse2, 0, 0 diff --git a/common/x86/quant.h b/common/x86/quant.h index df1890bf..9d80e2ea 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -54,9 +54,10 @@ void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_ void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); -void x264_denoise_dct_mmx( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); -void x264_denoise_dct_sse2( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); +void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); int x264_decimate_score15_mmxext( dctcoef *dct ); int x264_decimate_score15_sse2 ( dctcoef *dct ); int x264_decimate_score15_ssse3 ( dctcoef *dct ); diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index a76c5c11..6db50b1b 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -500,6 +500,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %endmacro %macro INIT_MMX 0 + %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %define mmsize 8 %define num_mmregs 8 @@ -521,6 +522,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %endmacro %macro INIT_XMM 0 + %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_XMM %define mmsize 16 %define num_mmregs 8 @@ -539,6 +541,12 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %endrep %endmacro +%macro INIT_AVX 0 + INIT_XMM + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_AVX +%endmacro + INIT_MMX ; I often want to use macros that permute their arguments. e.g. 
there's no @@ -646,3 +654,226 @@ INIT_MMX sub %1, %2 %endif %endmacro + +;============================================================================= +; AVX abstraction layer +;============================================================================= + +%define sizeofmm0 8 +%define sizeofmm1 8 +%define sizeofmm2 8 +%define sizeofmm3 8 +%define sizeofmm4 8 +%define sizeofmm5 8 +%define sizeofmm6 8 +%define sizeofmm7 8 +%define sizeofxmm0 16 +%define sizeofxmm1 16 +%define sizeofxmm2 16 +%define sizeofxmm3 16 +%define sizeofxmm4 16 +%define sizeofxmm5 16 +%define sizeofxmm6 16 +%define sizeofxmm7 16 +%define sizeofxmm8 16 +%define sizeofxmm9 16 +%define sizeofxmm10 16 +%define sizeofxmm11 16 +%define sizeofxmm12 16 +%define sizeofxmm13 16 +%define sizeofxmm14 16 +%define sizeofxmm15 16 + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm) +;%4 == number of operands given +;%5+: operands +%macro RUN_AVX_INSTR 6-7+ + %if sizeof%5==8 + %define %%regmov movq + %elif %2 + %define %%regmov movaps + %else + %define %%regmov movdqa + %endif + + %if %4>=3+%3 + %ifnidn %5, %6 + %if avx_enabled && sizeof%5==16 + v%1 %5, %6, %7 + %else + %%regmov %5, %6 + %1 %5, %7 + %endif + %else + %1 %5, %7 + %endif + %elif %3 + %1 %5, %6, %7 + %else + %1 %5, %6 + %endif +%endmacro + +;%1 == instruction +;%2 == 1 if float, 0 if int +;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm) +%macro AVX_INSTR 3 + %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 + %elifidn %4, fnord + RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3 + %elifidn %5, fnord + RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 + %else + RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 + %endif + %endmacro +%endmacro + +AVX_INSTR addpd, 1, 0 +AVX_INSTR addps, 1, 0 +AVX_INSTR addsd, 1, 0 +AVX_INSTR addss, 1, 0 +AVX_INSTR addsubpd, 1, 0 +AVX_INSTR addsubps, 1, 0 +AVX_INSTR andpd, 1, 0 +AVX_INSTR andps, 1, 0 +AVX_INSTR andnpd, 1, 0 +AVX_INSTR andnps, 1, 0 +AVX_INSTR blendpd, 1, 0 +AVX_INSTR blendps, 1, 0 +AVX_INSTR blendvpd, 1, 0 +AVX_INSTR blendvps, 1, 0 +AVX_INSTR cmppd, 1, 0 +AVX_INSTR cmpps, 1, 0 +AVX_INSTR cmpsd, 1, 0 +AVX_INSTR cmpss, 1, 0 +AVX_INSTR divpd, 1, 0 +AVX_INSTR divps, 1, 0 +AVX_INSTR divsd, 1, 0 +AVX_INSTR divss, 1, 0 +AVX_INSTR dppd, 1, 0 +AVX_INSTR dpps, 1, 0 +AVX_INSTR haddpd, 1, 0 +AVX_INSTR haddps, 1, 0 +AVX_INSTR hsubpd, 1, 0 +AVX_INSTR hsubps, 1, 0 +AVX_INSTR maxpd, 1, 0 +AVX_INSTR maxps, 1, 0 +AVX_INSTR maxsd, 1, 0 +AVX_INSTR maxss, 1, 0 +AVX_INSTR minpd, 1, 0 +AVX_INSTR minps, 1, 0 +AVX_INSTR minsd, 1, 0 +AVX_INSTR minss, 1, 0 +AVX_INSTR mpsadbw, 0, 1 +AVX_INSTR mulpd, 1, 0 +AVX_INSTR mulps, 1, 0 +AVX_INSTR mulsd, 1, 0 +AVX_INSTR mulss, 1, 0 +AVX_INSTR orpd, 1, 0 +AVX_INSTR orps, 1, 0 +AVX_INSTR packsswb, 0, 0 +AVX_INSTR packssdw, 0, 0 +AVX_INSTR packuswb, 0, 0 +AVX_INSTR packusdw, 0, 0 +AVX_INSTR paddb, 0, 0 +AVX_INSTR paddw, 0, 0 +AVX_INSTR paddd, 0, 0 +AVX_INSTR paddq, 0, 0 +AVX_INSTR paddsb, 0, 0 +AVX_INSTR paddsw, 0, 0 +AVX_INSTR paddusb, 0, 0 +AVX_INSTR paddusw, 0, 0 +AVX_INSTR palignr, 0, 1 +AVX_INSTR pand, 0, 0 +AVX_INSTR pandn, 0, 0 +AVX_INSTR pavgb, 0, 0 +AVX_INSTR pavgw, 0, 0 +AVX_INSTR pblendvb, 0, 0 +AVX_INSTR pblendw, 0, 1 +AVX_INSTR pcmpestri, 0, 0 +AVX_INSTR pcmpestrm, 0, 0 +AVX_INSTR pcmpistri, 0, 0 +AVX_INSTR pcmpistrm, 0, 0 +AVX_INSTR pcmpeqb, 0, 0 +AVX_INSTR pcmpeqw, 0, 0 +AVX_INSTR pcmpeqd, 0, 0 +AVX_INSTR pcmpeqq, 0, 0 +AVX_INSTR pcmpgtb, 0, 0 +AVX_INSTR pcmpgtw, 0, 
0 +AVX_INSTR pcmpgtd, 0, 0 +AVX_INSTR pcmpgtq, 0, 0 +AVX_INSTR phaddw, 0, 0 +AVX_INSTR phaddd, 0, 0 +AVX_INSTR phaddsw, 0, 0 +AVX_INSTR phsubw, 0, 0 +AVX_INSTR phsubd, 0, 0 +AVX_INSTR phsubsw, 0, 0 +AVX_INSTR pmaddwd, 0, 0 +AVX_INSTR pmaddubsw, 0, 0 +AVX_INSTR pmaxsb, 0, 0 +AVX_INSTR pmaxsw, 0, 0 +AVX_INSTR pmaxsd, 0, 0 +AVX_INSTR pmaxub, 0, 0 +AVX_INSTR pmaxuw, 0, 0 +AVX_INSTR pmaxud, 0, 0 +AVX_INSTR pminsb, 0, 0 +AVX_INSTR pminsw, 0, 0 +AVX_INSTR pminsd, 0, 0 +AVX_INSTR pminub, 0, 0 +AVX_INSTR pminuw, 0, 0 +AVX_INSTR pminud, 0, 0 +AVX_INSTR pmulhuw, 0, 0 +AVX_INSTR pmulhrsw, 0, 0 +AVX_INSTR pmulhw, 0, 0 +AVX_INSTR pmullw, 0, 0 +AVX_INSTR pmulld, 0, 0 +AVX_INSTR pmuludq, 0, 0 +AVX_INSTR pmuldq, 0, 0 +AVX_INSTR por, 0, 0 +AVX_INSTR psadbw, 0, 0 +AVX_INSTR pshufb, 0, 0 +AVX_INSTR psignb, 0, 0 +AVX_INSTR psignw, 0, 0 +AVX_INSTR psignd, 0, 0 +AVX_INSTR psllw, 0, 0 +AVX_INSTR pslld, 0, 0 +AVX_INSTR psllq, 0, 0 +AVX_INSTR pslldq, 0, 0 +AVX_INSTR psraw, 0, 0 +AVX_INSTR psrad, 0, 0 +AVX_INSTR psrlw, 0, 0 +AVX_INSTR psrld, 0, 0 +AVX_INSTR psrlq, 0, 0 +AVX_INSTR psrldq, 0, 0 +AVX_INSTR psubb, 0, 0 +AVX_INSTR psubw, 0, 0 +AVX_INSTR psubd, 0, 0 +AVX_INSTR psubq, 0, 0 +AVX_INSTR psubsb, 0, 0 +AVX_INSTR psubsw, 0, 0 +AVX_INSTR psubusb, 0, 0 +AVX_INSTR psubusw, 0, 0 +AVX_INSTR punpckhbw, 0, 0 +AVX_INSTR punpckhwd, 0, 0 +AVX_INSTR punpckhdq, 0, 0 +AVX_INSTR punpckhqdq, 0, 0 +AVX_INSTR punpcklbw, 0, 0 +AVX_INSTR punpcklwd, 0, 0 +AVX_INSTR punpckldq, 0, 0 +AVX_INSTR punpcklqdq, 0, 0 +AVX_INSTR pxor, 0, 0 +AVX_INSTR subpd, 1, 0 +AVX_INSTR subps, 1, 0 +AVX_INSTR subsd, 1, 0 +AVX_INSTR subss, 1, 0 +AVX_INSTR unpckhpd, 1, 0 +AVX_INSTR unpckhps, 1, 0 +AVX_INSTR unpcklpd, 1, 0 +AVX_INSTR unpcklps, 1, 0 +AVX_INSTR xorpd, 1, 0 +AVX_INSTR xorps, 1, 0 diff --git a/tools/checkasm.c b/tools/checkasm.c index 30b5d6f9..925ba5b5 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -165,6 +165,7 @@ static void print_bench(void) if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, + b->cpu&X264_CPU_AVX ? "avx" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : @@ -2020,6 +2021,8 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); } + if( x264_cpu_detect() & X264_CPU_AVX ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); #elif ARCH_PPC if( x264_cpu_detect() & X264_CPU_ALTIVEC ) { diff --git a/x264.h b/x264.h index eba893e4..3c91c904 100644 --- a/x264.h +++ b/x264.h @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 112 +#define X264_BUILD 113 /* x264_t: * opaque handler for encoder */ @@ -122,6 +122,9 @@ typedef struct #define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ #define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */ #define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */ +#define X264_CPU_AVX 0x400000 /* AVX support -- we don't currently use YMM registers, just + * the 3-operand capability, so we don't require OS support + * for AVX. */ /* Analyse flags */ -- 2.40.0
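
For reference, here is a minimal sketch of how the new overloading behaves at a call site. This example is not part of the patch: the function names and register assignments are hypothetical, and it assumes the file is assembled with the modified x86inc.asm on the include path. Only INIT_XMM, INIT_AVX, cglobal, mova and the overloaded psubusw come from x86inc.asm itself. The same three-operand source line assembles to a mov plus a two-operand instruction under SSE2, and to a single VEX-encoded three-operand instruction under AVX:

%include "x86inc.asm"

SECTION .text

; Hypothetical helper: dst = saturating_sub(a, b) on eight 16-bit words.
; r0 = dst, r1 = a, r2 = b, all 16-byte aligned.

INIT_XMM
cglobal example_psubusw_sse2, 3,3,3
    mova    m0, [r1]
    mova    m1, [r2]
    psubusw m2, m0, m1   ; avx_enabled=0: expands to
                         ;   movdqa  xmm2, xmm0
                         ;   psubusw xmm2, xmm1
    mova    [r0], m2
    RET

INIT_AVX
cglobal example_psubusw_avx, 3,3,3
    mova    m0, [r1]
    mova    m1, [r2]
    psubusw m2, m0, m1   ; avx_enabled=1: expands to
                         ;   vpsubusw xmm2, xmm0, xmm1
    mova    [r0], m2
    RET

Instructions declared with the third AVX_INSTR argument set to 1 (palignr, pblendw, mpsadbw, ...) work the same way, with the immediate passed as a fourth operand. When the destination register is also the first source, RUN_AVX_INSTR emits the plain two-operand form with no extra mov, so existing two-operand code is unaffected.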