PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
- PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
+ PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
- PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
+ PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
+ /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
+ prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
- lea r0, [r0+r5*2]
+ lea r2, [r2+r5*2]
add r5d, r5d
- add r1, r5
- add r2, r5
- add r3, r5
add r4, r5
neg r5
+ sub r1, r5
+ sub r3, r5
+ sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
- pminsd m3, m0
pmaddwd m1, m0
- psubd m3, m0, m3
+ psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
- pminsw xm3, xm0
+ psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
- subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
- vcvtps2dq m1, m1
+ cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
INIT_YMM avx2
MBTREE_AVX
+INIT_ZMM avx512 ; AVX-512 variant: m# are 512-bit registers, so each loop pass handles 16 elements
+cglobal mbtree_propagate_cost, 6,6 ; args per the C prototype: r0=dst r1=propagate_in r2=intra_costs r3=inter_costs r4=inv_qscales r5=fps_factor, len read from r6m
+ vbroadcastss m5, [r5] ; m5 = *fps_factor replicated to every float lane; r5 is free for reuse after this
+ mov r5d, 0x3fff3fff ; 0x3fff in both 16-bit halves
+ vpbroadcastd ym4, r5d ; ym4 = per-word 0x3fff mask applied to the inter costs below
+ mov r5d, r6m ; r5 = len (element count)
+ lea r2, [r2+r5*2] ; r2 = intra_costs + len*2 bytes (end of array); done before r5 is doubled
+ add r5d, r5d ; r5 = len*2 = size in bytes of each 16-bit array
+ add r1, r5 ; r1 = end of propagate_in
+ neg r5 ; r5 = -len*2: negative byte offset that counts up towards zero
+ sub r4, r5 ; r4 = end of inv_qscales (subtracting the negative offset adds len*2)
+ sub r3, r5 ; r3 = end of inter_costs
+ sub r0, r5 ; r0 = end of dst
+.loop: ; one pass = 16 elements; len is not rounded, so the last pass may touch up to 15 elements past len
+ pmovzxwd m0, [r2+r5] ; intra: 16 words zero-extended to dwords
+ pmovzxwd m1, [r1+r5] ; prop: 16 words zero-extended to dwords
+ pmovzxwd m2, [r4+r5] ; invq: 16 words zero-extended to dwords
+ pand ym3, ym4, [r3+r5] ; inter: keep only the low 14 bits of each word
+ pmovzxwd m3, ym3 ; widen masked inter to dwords
+ psubusw m3, m0, m3 ; m3 = intra - inter, clamped at 0 (word saturation suffices: upper halves are zero)
+ cvtdq2ps m0, m0 ; intra -> float
+ cvtdq2ps m1, m1 ; prop -> float
+ cvtdq2ps m2, m2 ; invq -> float
+ cvtdq2ps m3, m3 ; (intra - inter) -> float
+ vdivps m1, m0, {rn-sae} ; m1 = prop / intra, embedded round-to-nearest with exceptions suppressed
+ fmaddps m1, m2, m5, m1 ; m1 = invq * fps_factor + prop / intra
+ mulps m1, m3 ; m1 *= (intra - inter)
+ cvtps2dq m1, m1 ; float -> int32 using the current rounding mode
+ vpmovsdw [r0+r5], m1 ; signed-saturate 16 dwords to int16 and store 32 bytes to dst
+ add r5, 32 ; advance 16 elements * 2 bytes
+ jl .loop ; continue while the offset is still negative
+ RET
+
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
+
+ if( !(cpu&X264_CPU_AVX512) )
+ return;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
}