From: Henrik Gramner Date: Wed, 7 Sep 2016 17:27:31 +0000 (+0200) Subject: x86: AVX2 mbtree_propagate_list X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0ce77f9eb71051c9a6121ec12c2abaac99ee628a;p=libx264 x86: AVX2 mbtree_propagate_list SIMD part is around 25% faster than AVX on Haswell, around 7% faster when including the runtime of the scalar C wrapper. --- diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index dba36d26..ea61c812 100644 --- a/common/x86/const-a.asm +++ b/common/x86/const-a.asm @@ -38,6 +38,8 @@ const pw_00ff, times 16 dw 0x00ff const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 const pd_1, times 8 dd 1 +const pd_0123, dd 0,1,2,3 +const pd_4567, dd 4,5,6,7 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7 @@ -63,6 +65,7 @@ const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 +const pd_8, times 4 dd 8 const pd_32, times 4 dd 32 const pd_1024, times 4 dd 1024 const pd_ffff, times 4 dd 0xffff diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index f5c34187..b2b56411 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -94,6 +94,8 @@ cextern pw_00ff cextern pw_3fff cextern pw_pixel_max cextern pw_0to15 +cextern pd_8 +cextern pd_0123 cextern pd_ffff %macro LOAD_ADD 4 @@ -2178,7 +2180,7 @@ MBTREE_AVX %macro MBTREE_PROPAGATE_LIST 0 ;----------------------------------------------------------------------------- -; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs, +; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, ; int16_t *output, int bipred_weight, int mb_y, int len ) ;----------------------------------------------------------------------------- cglobal mbtree_propagate_list_internal, 4,6,8 @@ -2268,6 +2270,67 @@ MBTREE_PROPAGATE_LIST INIT_XMM avx MBTREE_PROPAGATE_LIST +INIT_YMM avx2 +cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8 + mova xm4, [pw_0xc000] +%if UNIX64 + shl r4d, 9 + shl r5d, 16 + movd xm5, r4d + movd xm6, r5d + vpbroadcastw xm5, xm5 + vpbroadcastd m6, xm6 +%else + vpbroadcastw xm5, r4m + vpbroadcastd m6, r5m + psllw xm5, 9 ; bipred_weight << 9 + pslld m6, 16 +%endif + mov r4d, r6m + lea r1, [r1+r4*2] + lea r2, [r2+r4*2] + lea r0, [r0+r4*4] + neg r4 + por m6, [pd_0123] ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y + vbroadcasti128 m7, [pw_31] +.loop: + mova xm3, [r1+r4*2] + pand xm0, xm4, [r2+r4*2] + pmulhrsw xm1, xm3, xm5 ; bipred_amount = (propagate_amount * bipred_weight + 32) >> 6 + pcmpeqw xm0, xm4 + pblendvb xm3, xm3, xm1, xm0 ; (lists_used == 3) ? bipred_amount : propagate_amount + vpermq m3, m3, q1100 + + movu m0, [r0+r4*4] ; {x, y} + vbroadcasti128 m1, [pd_8] + psraw m2, m0, 5 + paddw m2, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y} + paddw m6, m1 ; i_mb_x += 8 + mova [r3], m2 + + mova m1, [pw_32] + pand m0, m7 + psubw m1, m0 + packuswb m1, m0 ; {32-x, 32-y} {x, y} {32-x, 32-y} {x, y} + psrlw m0, m1, 3 + pand m1, [pw_00ff] ; 32-x x 32-x x + pandn m0, m7, m0 ; (32-y y 32-y y) << 5 + pshufd m2, m1, q1032 + pmullw m1, m0 ; idx0 idx3 idx0 idx3 + pmullw m2, m0 ; idx1 idx2 idx1 idx2 + + pmulhrsw m0, m1, m3 ; (idx0 idx3 idx0 idx3) * propagate_amount + 512 >> 10 + pmulhrsw m2, m3 ; (idx1 idx2 idx1 idx2) * propagate_amount + 512 >> 10 + psignw m0, m1 ; correct potential overflow in the idx0 input to pmulhrsw + punpcklwd m1, m0, m2 ; idx01weight + punpckhwd m2, m0 ; idx23weight + mova [r3+32], m1 + mova [r3+64], m2 + add r3, 3*mmsize + add r4, 8 + jl .loop + RET + %macro MBTREE_FIX8 0 ;----------------------------------------------------------------------------- ; void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 21acdebd..3bff4080 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -532,6 +532,7 @@ do\ PROPAGATE_LIST(ssse3) PROPAGATE_LIST(avx) +PROPAGATE_LIST(avx2) void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { @@ -843,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2; } diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm index bb1282df..0c25914a 100644 --- a/common/x86/trellis-64.asm +++ b/common/x86/trellis-64.asm @@ -53,14 +53,14 @@ SECTION_RODATA -pd_8: times 4 dd 8 pd_m16: times 4 dd -16 -pd_0123: dd 0, 1, 2, 3 -pd_4567: dd 4, 5, 6, 7 sq_1: dq 1, 0 pq_128: times 2 dq 128 pq_ffffffff: times 2 dq 0xffffffff +cextern pd_8 +cextern pd_0123 +cextern pd_4567 cextern cabac_entropy cextern cabac_transition cextern cabac_size_unary