From: Janne Grunau
Date: Wed, 29 Oct 2014 17:17:48 +0000 (+0100)
Subject: aarch64: x264_mbtree_propagate_{cost,list}_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8d655b63b4f7bc021ad038ea64b7c4de9d0ef74b;p=libx264

aarch64: x264_mbtree_propagate_{cost,list}_neon

x264_mbtree_propagate_cost_neon is ~7 times faster.
x264_mbtree_propagate_list_neon is 33% faster.
---

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 83652f2e..84074516 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1484,3 +1484,120 @@ function integral_init8v_neon, export=1
 2:
     ret
 endfunc
+
+function x264_mbtree_propagate_cost_neon, export=1
+    ld1r        {v5.4s},  [x5]
+8:
+    subs        w6,  w6,  #8
+    ld1         {v1.8h},  [x1], #16
+    ld1         {v2.8h},  [x2], #16
+    ld1         {v3.8h},  [x3], #16
+    ld1         {v4.8h},  [x4], #16
+    bic         v3.8h,  #0xc0, lsl #8
+    umin        v3.8h,  v2.8h,  v3.8h
+    umull       v20.4s, v2.4h,  v4.4h   // propagate_intra
+    umull2      v21.4s, v2.8h,  v4.8h   // propagate_intra
+    usubl       v22.4s, v2.4h,  v3.4h   // propagate_num
+    usubl2      v23.4s, v2.8h,  v3.8h   // propagate_num
+    uxtl        v26.4s, v2.4h           // propagate_denom
+    uxtl2       v27.4s, v2.8h           // propagate_denom
+    uxtl        v24.4s, v1.4h
+    uxtl2       v25.4s, v1.8h
+    ucvtf       v20.4s, v20.4s
+    ucvtf       v21.4s, v21.4s
+    ucvtf       v26.4s, v26.4s
+    ucvtf       v27.4s, v27.4s
+    ucvtf       v22.4s, v22.4s
+    ucvtf       v23.4s, v23.4s
+    frecpe      v28.4s, v26.4s
+    frecpe      v29.4s, v27.4s
+    ucvtf       v24.4s, v24.4s
+    ucvtf       v25.4s, v25.4s
+    frecps      v30.4s, v28.4s, v26.4s
+    frecps      v31.4s, v29.4s, v27.4s
+    fmla        v24.4s, v20.4s, v5.4s   // propagate_amount
+    fmla        v25.4s, v21.4s, v5.4s   // propagate_amount
+    fmul        v28.4s, v28.4s, v30.4s
+    fmul        v29.4s, v29.4s, v31.4s
+    fmul        v16.4s, v24.4s, v22.4s
+    fmul        v17.4s, v25.4s, v23.4s
+    fmul        v18.4s, v16.4s, v28.4s
+    fmul        v19.4s, v17.4s, v29.4s
+    fcvtns      v20.4s, v18.4s
+    fcvtns      v21.4s, v19.4s
+    sqxtn       v0.4h,  v20.4s
+    sqxtn2      v0.8h,  v21.4s
+    st1         {v0.8h},  [x0], #16
+    b.ge        8b
+    ret
+endfunc
+
+const pw_0to15, align=5
+    .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+endconst
+
+function x264_mbtree_propagate_list_internal_neon, export=1
+    movrel      x11, pw_0to15
+    dup         v31.8h, w4              // bipred_weight
+    movi        v30.8h, #0xc0, lsl #8
+    ld1         {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
+    movi        v28.4s, #4//, lsl #16
+    movi        v27.8h, #31
+    movi        v26.8h, #32
+    dup         v24.8h, w5              // mb_y
+    zip1        v29.8h, v29.8h, v24.8h
+8:
+    subs        w6,  w6,  #8
+    ld1         {v1.8h},  [x1], #16     // propagate_amount
+    ld1         {v2.8h},  [x2], #16     // lowres_cost
+    and         v2.16b, v2.16b, v30.16b
+    cmeq        v25.8h, v2.8h,  v30.8h
+    umull       v16.4s, v1.4h,  v31.4h
+    umull2      v17.4s, v1.8h,  v31.8h
+    rshrn       v16.4h, v16.4s, #6
+    rshrn2      v16.8h, v17.4s, #6
+    bsl         v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
+    //      propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+    ld1         {v4.8h,v5.8h}, [x0], #32
+    sshr        v6.8h,  v4.8h,  #5
+    sshr        v7.8h,  v5.8h,  #5
+    add         v6.8h,  v6.8h,  v29.8h
+    add         v29.8h, v29.8h, v28.8h
+    add         v7.8h,  v7.8h,  v29.8h
+    add         v29.8h, v29.8h, v28.8h
+    st1         {v6.8h,v7.8h},  [x3], #32
+    and         v4.16b, v4.16b, v27.16b
+    and         v5.16b, v5.16b, v27.16b
+    uzp1        v6.8h,  v4.8h,  v5.8h   // x & 31
+    uzp2        v7.8h,  v4.8h,  v5.8h   // y & 31
+    sub         v4.8h,  v26.8h, v6.8h   // 32 - (x & 31)
+    sub         v5.8h,  v26.8h, v7.8h   // 32 - (y & 31)
+    mul         v19.8h, v6.8h,  v7.8h   // idx3weight = y*x;
+    mul         v18.8h, v4.8h,  v7.8h   // idx2weight = y*(32-x);
+    mul         v17.8h, v6.8h,  v5.8h   // idx1weight = (32-y)*x;
+    mul         v16.8h, v4.8h,  v5.8h   // idx0weight = (32-y)*(32-x) ;
+    umull       v6.4s,  v19.4h, v25.4h
+    umull2      v7.4s,  v19.8h, v25.8h
+    umull       v4.4s,  v18.4h, v25.4h
+    umull2      v5.4s,  v18.8h, v25.8h
+    umull       v2.4s,  v17.4h, v25.4h
+    umull2      v3.4s,  v17.8h, v25.8h
+    umull       v0.4s,  v16.4h, v25.4h
+    umull2      v1.4s,  v16.8h, v25.8h
+    rshrn       v19.4h, v6.4s,  #10
+    rshrn2      v19.8h, v7.4s,  #10
+    rshrn       v18.4h, v4.4s,  #10
+    rshrn2      v18.8h, v5.4s,  #10
+    rshrn       v17.4h, v2.4s,  #10
+    rshrn2      v17.8h, v3.4s,  #10
+    rshrn       v16.4h, v0.4s,  #10
+    rshrn2      v16.8h, v1.4s,  #10
+    zip1        v0.8h,  v16.8h, v17.8h
+    zip2        v1.8h,  v16.8h, v17.8h
+    zip1        v2.8h,  v18.8h, v19.8h
+    zip2        v3.8h,  v18.8h, v19.8h
+    st1         {v0.8h,v1.8h},  [x3], #32
+    st1         {v2.8h,v3.8h},  [x3], #32
+    b.ge        8b
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index f40fed62..96582d45 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -96,6 +96,8 @@ void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
 void integral_init8v_neon( uint16_t *, intptr_t );
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                        intptr_t, intptr_t, int, int );
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -201,6 +203,89 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             int height, int16_t *buf );
 #endif // !HIGH_BIT_DEPTH
 
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+
+void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
+                                               int16_t *propagate_amount,
+                                               uint16_t *lowres_costs,
+                                               int16_t *output,
+                                               int bipred_weight, int mb_y,
+                                               int len );
+
+static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
+                                             int16_t (*mvs)[2],
+                                             int16_t *propagate_amount,
+                                             uint16_t *lowres_costs,
+                                             int bipred_weight, int mb_y,
+                                             int len, int list )
+{
+    int16_t *current = h->scratch_buffer2;
+
+    x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
+                                              lowres_costs, current,
+                                              bipred_weight, mb_y, len );
+
+    unsigned stride = h->mb.i_mb_stride;
+    unsigned width = h->mb.i_mb_width;
+    unsigned height = h->mb.i_mb_height;
+
+    for( unsigned i = 0; i < len; current += 32 )
+    {
+        int end = X264_MIN( i+8, len );
+        for( ; i < end; i++, current += 2 )
+        {
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
+                continue;
+
+            unsigned mbx = current[0];
+            unsigned mby = current[1];
+            unsigned idx0 = mbx + mby * stride;
+            unsigned idx2 = idx0 + stride;
+
+            /* Shortcut for the simple/common case of zero MV */
+            if( !M32( mvs[i] ) )
+            {
+                CLIP_ADD( ref_costs[idx0], current[16] );
+                continue;
+            }
+
+            if( mbx < width-1 && mby < height-1 )
+            {
+                CLIP_ADD2( ref_costs+idx0, current+16 );
+                CLIP_ADD2( ref_costs+idx2, current+32 );
+            }
+            else
+            {
+                /* Note: this takes advantage of unsigned representation to
+                 * catch negative mbx/mby. */
+                if( mby < height )
+                {
+                    if( mbx < width )
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );
+                    if( mbx+1 < width )
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );
+                }
+                if( mby+1 < height )
+                {
+                    if( mbx < width )
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );
+                    if( mbx+1 < width )
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );
+                }
+            }
+        }
+    }
+}
+
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
 void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 {
 #if !HIGH_BIT_DEPTH
@@ -252,5 +337,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8h = integral_init8h_neon;
     pf->integral_init4v = integral_init4v_neon;
     pf->integral_init8v = integral_init8v_neon;
+
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
 #endif // !HIGH_BIT_DEPTH
 }
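
For reference, a rough scalar C sketch of the per-macroblock computation that
x264_mbtree_propagate_cost_neon vectorizes, reconstructed from the register comments in
the assembly above (propagate_intra, propagate_num, propagate_denom, propagate_amount).
The function and argument names and the MIN macro are illustrative only, and the plain
float divide stands in for the frecpe/frecps reciprocal estimate and the fcvtns/sqxtn
rounding and saturation used by the NEON code:

#include <stdint.h>

#define MIN(a,b) ((a) < (b) ? (a) : (b))

/* dst, propagate_in, intra, inter and inv_qscale mirror x0..x4 above,
 * fps_factor mirrors *x5 and len the counter decremented in w6 */
static void mbtree_propagate_cost_scalar( int16_t *dst, const uint16_t *propagate_in,
                                          const uint16_t *intra, const uint16_t *inter,
                                          const uint16_t *inv_qscale,
                                          const float *fps_factor, int len )
{
    float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        int intra_cost = intra[i];
        int inter_cost = MIN( intra_cost, inter[i] & 0x3fff );  /* bic #0xc0, lsl #8 + umin */
        float propagate_intra  = (float)intra_cost * inv_qscale[i];       /* umull          */
        float propagate_amount = propagate_in[i] + propagate_intra * fps; /* fmla with *x5  */
        float propagate_num    = intra_cost - inter_cost;                 /* usubl          */
        float propagate_denom  = intra_cost;                   /* uxtl, assumed nonzero     */
        float cost = propagate_amount * propagate_num / propagate_denom;  /* fmul + recip   */
        if( cost > 32767.0f )
            cost = 32767.0f;                       /* sqxtn saturates the result to int16   */
        dst[i] = (int16_t)( cost + 0.5f );         /* fcvtns: convert with round-to-nearest */
    }
}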