Add the first AVX2 function (mbtree_propagate_cost, AVX2+FMA3) for testing.
Bump the minimum yasm version to 1.2.0, which is needed for AVX2 support.
%if cpuflag(fma4)
cvtdq2ps xmm0, xmm0
cvtdq2ps xmm1, xmm1
- vfmaddps xmm0, xmm0, xmm6, xmm1
+ fmaddps xmm0, xmm0, xmm6, xmm1
cvtdq2ps xmm1, xmm2
psubd xmm2, xmm3
cvtdq2ps xmm2, xmm2
mulps xmm1, xmm3
mulps xmm0, xmm2
addps xmm2, xmm3, xmm3
- vfnmaddps xmm3, xmm1, xmm3, xmm2
+ fnmaddps xmm3, xmm1, xmm3, xmm2
mulps xmm0, xmm3
%else
cvtdq2ps xmm0, xmm0
MBTREE
%macro INT16_TO_FLOAT 1 ; %1 = register number: widens the 8 words in xmm%1 to dwords and converts them to 8 floats in ymm%1
+%if cpuflag(avx2)
+ vpmovzxwd ymm%1, xmm%1 ; zero-extend all 8 words to dwords in a single instruction
+%else
vpunpckhwd xmm4, xmm%1, xmm7 ; high 4 words interleaved with xmm7 (cleared by MBTREE_AVX on this path) -> 4 dwords; clobbers xmm4
vpunpcklwd xmm%1, xmm7 ; low 4 words interleaved with xmm7 -> 4 dwords
vinsertf128 ymm%1, ymm%1, xmm4, 1 ; put the high 4 dwords into the upper 128-bit lane
+%endif
vcvtdq2ps ymm%1, ymm%1 ; int32 -> float
%endmacro
; FIXME: align loads/stores to 16 bytes
-INIT_YMM avx
+%macro MBTREE_AVX 0 ; emits mbtree_propagate_cost for the instruction set selected by the preceding INIT_YMM
cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d ; r6 = len*2 (r6 is presumably the 7th argument, len; the input arrays are uint16_t per the C prototype)
lea r0, [r0+r6*2] ; advance r0 (dst) by len*4 bytes
vmovdqa xmm5, [pw_3fff] ; constant pw_3fff; its use is not in the visible hunks
vbroadcastss ymm6, [r5] ; broadcast the float at [r5] (fps_factor per the C prototype) to all lanes
vmulps ymm6, ymm6, [pf_inv256] ; pre-multiply the factor by pf_inv256
+%if notcpuflag(avx2)
vpxor xmm7, xmm7 ; zero register used by the non-AVX2 path of INT16_TO_FLOAT
+%endif
.loop:
vmovdqu xmm0, [r2+r6] ; intra
vmovdqu xmm1, [r4+r6] ; invq
INT16_TO_FLOAT 1 ; NOTE(review): the loads feeding xmm2/xmm3 are not visible here - presumably in elided diff context
INT16_TO_FLOAT 2
INT16_TO_FLOAT 3
+%if cpuflag(fma3)
+ vmulps ymm1, ymm1, ymm0 ; invq * intra
+ vsubps ymm4, ymm0, ymm3 ; intra - ymm3
+ fmaddps ymm1, ymm1, ymm6, ymm2 ; ymm1 = ymm1*ymm6 + ymm2 (scale by the fps factor and add ymm2 in one fused op)
+ vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
+ vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
+ vmulps ymm1, ymm1, ymm4 ; * (intra - ymm3)
+ vaddps ymm4, ymm3, ymm3 ; 2 * (1/intra 1st approx)
+ fnmaddps ymm4, ymm2, ymm3, ymm4 ; ymm4 = ymm4 - ymm2*ymm3: 2nd approximation for 1/intra
+ vmulps ymm1, ymm1, ymm4 ; / intra
+%else
vmulps ymm1, ymm1, ymm0 ; invq * intra
vsubps ymm4, ymm0, ymm3 ; intra - ymm3
vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
vmulps ymm1, ymm1, ymm3 ; / intra
+%endif
vcvtps2dq ymm1, ymm1 ; float -> int32
vmovdqu [r0+r6*2], ymm1 ; store 8 dwords to dst; offset is r6*2 because each 2-byte input yields a 4-byte output
add r6, 16 ; advance by 16 input bytes = 8 uint16 elements
jl .loop ; loop while r6 is negative - NOTE(review): implies r6 is negated in lines not shown here; confirm
RET
+%endmacro
+; Build the function twice: once for plain AVX, once for AVX2 with FMA3.
+INIT_YMM avx
+MBTREE_AVX
+INIT_YMM avx2,fma3
+MBTREE_AVX
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
- if( !(cpu&X264_CPU_FMA4) )
+ if( cpu&X264_CPU_FMA4 )
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+ if( !(cpu&X264_CPU_AVX2) )
return;
- pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
+
+ if( cpu&X264_CPU_FMA3 )
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3;
}
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
+
+; convert FMA4 to FMA3 if possible: each generated wrapper takes FMA4-style operands (dst, a, b, c), emits the real FMA4 instruction under cpuflag(fma4), and otherwise emits an FMA3 form, which requires dst to be the same register as a, b or c
+%macro FMA4_INSTR 4 ; args: wrapper name (= FMA4 mnemonic without the v prefix), then the FMA3 132, 213 and 231 mnemonics; they become default parameters 5-8 of the wrapper
+ %macro %1 4-8 %1, %2, %3, %4
+ %if cpuflag(fma4)
+ v%5 %1, %2, %3, %4
+ %elifidn %1, %2
+ v%6 %1, %4, %3 ; 132 form, dst == a: %1 = %1 * %3 + %4
+ %elifidn %1, %3
+ v%7 %1, %2, %4 ; 213 form, dst == b: %1 = %2 * %1 + %4
+ %elifidn %1, %4
+ v%8 %1, %2, %3 ; 231 form, dst == c: %1 = %2 * %3 + %1
+ %else
+ %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
+ %endif
+ %endmacro
+%endmacro
+; One wrapper per FMA operation and data type (packed/scalar, single/double). NOTE(review): the non-fma4 path does not itself test cpuflag(fma3).
+FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
+FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
+FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
+FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
+
+FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
+FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
+FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
+FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
+
+FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
+FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
+FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
+FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
+
+FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
+FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
+FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
+FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
+
+FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
+FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
+FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
+FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
fi
if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
- if ! as_check "vpperm xmm0, xmm0, xmm0, xmm0" ; then
+ if ! as_check "vpmovzxwd ymm0, xmm0" ; then
VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1`
echo "Found $VER"
- echo "Minimum version is yasm-1.0.0"
+ echo "Minimum version is yasm-1.2.0"
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
if( k < j )
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+ b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" );
cpu1 &= ~X264_CPU_FMA4;
}
- if( x264_cpu_detect() & X264_CPU_FMA3 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
- cpu1 &= ~X264_CPU_FMA3;
- }
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_FMA3 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
+ cpu1 &= ~X264_CPU_FMA3;
+ }
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{