From: Fiona Glaser Date: Mon, 12 Nov 2012 18:28:53 +0000 (-0800) Subject: AVX2/FMA3 version of mbtree_propagate X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ccda1ba4d8d902945c68aa25ec20867055d1b079;p=libx264 AVX2/FMA3 version of mbtree_propagate First AVX2 function for testing. Bump yasm version to 1.2.0 for AVX2 support. --- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index b0633794..7a36b798 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1702,7 +1702,7 @@ cglobal mbtree_propagate_cost, 7,7,7 %if cpuflag(fma4) cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 - vfmaddps xmm0, xmm0, xmm6, xmm1 + fmaddps xmm0, xmm0, xmm6, xmm1 cvtdq2ps xmm1, xmm2 psubd xmm2, xmm3 cvtdq2ps xmm2, xmm2 @@ -1710,7 +1710,7 @@ cglobal mbtree_propagate_cost, 7,7,7 mulps xmm1, xmm3 mulps xmm0, xmm2 addps xmm2, xmm3, xmm3 - vfnmaddps xmm3, xmm1, xmm3, xmm2 + fnmaddps xmm3, xmm1, xmm3, xmm2 mulps xmm0, xmm3 %else cvtdq2ps xmm0, xmm0 @@ -1742,14 +1742,18 @@ INIT_XMM fma4 MBTREE %macro INT16_TO_FLOAT 1 +%if cpuflag(avx2) + vpmovzxwd ymm%1, xmm%1 +%else vpunpckhwd xmm4, xmm%1, xmm7 vpunpcklwd xmm%1, xmm7 vinsertf128 ymm%1, ymm%1, xmm4, 1 +%endif vcvtdq2ps ymm%1, ymm%1 %endmacro ; FIXME: align loads/stores to 16 bytes -INIT_YMM avx +%macro MBTREE_AVX 0 cglobal mbtree_propagate_cost, 7,7,8 add r6d, r6d lea r0, [r0+r6*2] @@ -1761,7 +1765,9 @@ cglobal mbtree_propagate_cost, 7,7,8 vmovdqa xmm5, [pw_3fff] vbroadcastss ymm6, [r5] vmulps ymm6, ymm6, [pf_inv256] +%if notcpuflag(avx2) vpxor xmm7, xmm7 +%endif .loop: vmovdqu xmm0, [r2+r6] ; intra vmovdqu xmm1, [r4+r6] ; invq @@ -1771,6 +1777,17 @@ cglobal mbtree_propagate_cost, 7,7,8 INT16_TO_FLOAT 1 INT16_TO_FLOAT 2 INT16_TO_FLOAT 3 +%if cpuflag(fma3) + vmulps ymm1, ymm1, ymm0 + vsubps ymm4, ymm0, ymm3 + fmaddps ymm1, ymm1, ymm6, ymm2 + vrcpps ymm3, ymm0 + vmulps ymm2, ymm0, ymm3 + vmulps ymm1, ymm1, ymm4 + vaddps ymm4, ymm3, ymm3 + fnmaddps ymm4, ymm2, ymm3, ymm4 + vmulps ymm1, ymm1, ymm4 +%else vmulps ymm1, ymm1, ymm0 vsubps ymm4, ymm0, ymm3 vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8 @@ -1782,8 +1799,15 @@ cglobal mbtree_propagate_cost, 7,7,8 vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx) vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra vmulps ymm1, ymm1, ymm3 ; / intra +%endif vcvtps2dq ymm1, ymm1 vmovdqu [r0+r6*2], ymm1 add r6, 16 jl .loop RET +%endmacro + +INIT_YMM avx +MBTREE_AVX +INIT_YMM avx2,fma3 +MBTREE_AVX diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 8e587536..00aa8508 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -139,6 +139,8 @@ void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ @@ -754,7 +756,12 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; - if( !(cpu&X264_CPU_FMA4) ) + if( cpu&X264_CPU_FMA4 ) + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4; + + if( !(cpu&X264_CPU_AVX2) ) return; - pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4; + + if( cpu&X264_CPU_FMA3 ) + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3; } diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 2eb6c652..9436f290 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -1361,3 +1361,45 @@ FMA_INSTR pmadcswd, pmaddwd, paddd ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. ; This lets us use tzcnt without bumping the yasm version requirement yet. %define tzcnt rep bsf + +; convert FMA4 to FMA3 if possible +%macro FMA4_INSTR 4 + %macro %1 4-8 %1, %2, %3, %4 + %if cpuflag(fma4) + v%5 %1, %2, %3, %4 + %elifidn %1, %2 + v%6 %1, %4, %3 ; %1 = %1 * %3 + %4 + %elifidn %1, %3 + v%7 %1, %2, %4 ; %1 = %2 * %1 + %4 + %elifidn %1, %4 + v%8 %1, %2, %3 ; %1 = %2 * %3 + %1 + %else + %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported + %endif + %endmacro +%endmacro + +FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd +FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps +FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd +FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss + +FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd +FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps +FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd +FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps + +FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd +FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps +FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd +FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss + +FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd +FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps +FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd +FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss + +FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd +FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps +FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd +FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss diff --git a/configure b/configure index 94285b27..cb8f6694 100755 --- a/configure +++ b/configure @@ -687,10 +687,10 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then - if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then + if ! as_check "vpmovzxwd ymm0, xmm0" ; then VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1` echo "Found $VER" - echo "Minimum version is yasm-1.0.0" + echo "Minimum version is yasm-1.2.0" echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi diff --git a/tools/checkasm.c b/tools/checkasm.c index abf581b5..8834865a 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -164,6 +164,7 @@ static void print_bench(void) if( k < j ) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, + b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : @@ -2444,11 +2445,6 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( x264_cpu_detect() & X264_CPU_FMA3 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } if( x264_cpu_detect() & X264_CPU_BMI1 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); @@ -2466,6 +2462,11 @@ static int check_all_flags( void ) } if( x264_cpu_detect() & X264_CPU_AVX2 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( x264_cpu_detect() & X264_CPU_FMA3 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); + cpu1 &= ~X264_CPU_FMA3; + } #elif ARCH_PPC if( x264_cpu_detect() & X264_CPU_ALTIVEC ) {