From: Janne Grunau
Date: Wed, 29 Oct 2014 17:17:48 +0000 (+0100)
Subject: aarch64: x264_mbtree_propagate_{cost,list}_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8d655b63b4f7bc021ad038ea64b7c4de9d0ef74b;p=libx264

aarch64: x264_mbtree_propagate_{cost,list}_neon

x264_mbtree_propagate_cost_neon is ~7 times faster.
x264_mbtree_propagate_list_neon is 33% faster.
---

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 83652f2e..84074516 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1484,3 +1484,120 @@ function integral_init8v_neon, export=1
 2:
     ret
 endfunc
+
+function x264_mbtree_propagate_cost_neon, export=1
+    ld1r        {v5.4s},  [x5]
+8:
+    subs        w6,  w6,  #8
+    ld1         {v1.8h},  [x1], #16
+    ld1         {v2.8h},  [x2], #16
+    ld1         {v3.8h},  [x3], #16
+    ld1         {v4.8h},  [x4], #16
+    bic         v3.8h,  #0xc0, lsl #8
+    umin        v3.8h,  v2.8h,  v3.8h
+    umull       v20.4s, v2.4h,  v4.4h   // propagate_intra
+    umull2      v21.4s, v2.8h,  v4.8h   // propagate_intra
+    usubl       v22.4s, v2.4h,  v3.4h   // propagate_num
+    usubl2      v23.4s, v2.8h,  v3.8h   // propagate_num
+    uxtl        v26.4s, v2.4h           // propagate_denom
+    uxtl2       v27.4s, v2.8h           // propagate_denom
+    uxtl        v24.4s, v1.4h
+    uxtl2       v25.4s, v1.8h
+    ucvtf       v20.4s, v20.4s
+    ucvtf       v21.4s, v21.4s
+    ucvtf       v26.4s, v26.4s
+    ucvtf       v27.4s, v27.4s
+    ucvtf       v22.4s, v22.4s
+    ucvtf       v23.4s, v23.4s
+    frecpe      v28.4s, v26.4s
+    frecpe      v29.4s, v27.4s
+    ucvtf       v24.4s, v24.4s
+    ucvtf       v25.4s, v25.4s
+    frecps      v30.4s, v28.4s, v26.4s
+    frecps      v31.4s, v29.4s, v27.4s
+    fmla        v24.4s, v20.4s, v5.4s   // propagate_amount
+    fmla        v25.4s, v21.4s, v5.4s   // propagate_amount
+    fmul        v28.4s, v28.4s, v30.4s
+    fmul        v29.4s, v29.4s, v31.4s
+    fmul        v16.4s, v24.4s, v22.4s
+    fmul        v17.4s, v25.4s, v23.4s
+    fmul        v18.4s, v16.4s, v28.4s
+    fmul        v19.4s, v17.4s, v29.4s
+    fcvtns      v20.4s, v18.4s
+    fcvtns      v21.4s, v19.4s
+    sqxtn       v0.4h,  v20.4s
+    sqxtn2      v0.8h,  v21.4s
+    st1         {v0.8h},  [x0], #16
+    b.ge        8b
+    ret
+endfunc
+
+const pw_0to15, align=5
+    .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+endconst
+
+function x264_mbtree_propagate_list_internal_neon, export=1
+    movrel      x11, pw_0to15
+    dup         v31.8h, w4              // bipred_weight
+    movi        v30.8h, #0xc0, lsl #8
+    ld1         {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
+    movi        v28.4s, #4//, lsl #16
+    movi        v27.8h, #31
+    movi        v26.8h, #32
+    dup         v24.8h, w5              // mb_y
+    zip1        v29.8h, v29.8h, v24.8h
+8:
+    subs        w6,  w6,  #8
+    ld1         {v1.8h},  [x1], #16     // propagate_amount
+    ld1         {v2.8h},  [x2], #16     // lowres_cost
+    and         v2.16b, v2.16b, v30.16b
+    cmeq        v25.8h, v2.8h,  v30.8h
+    umull       v16.4s, v1.4h,  v31.4h
+    umull2      v17.4s, v1.8h,  v31.8h
+    rshrn       v16.4h, v16.4s, #6
+    rshrn2      v16.8h, v17.4s, #6
+    bsl         v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
+    //      propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+    ld1         {v4.8h,v5.8h}, [x0], #32
+    sshr        v6.8h,  v4.8h,  #5
+    sshr        v7.8h,  v5.8h,  #5
+    add         v6.8h,  v6.8h,  v29.8h
+    add         v29.8h, v29.8h, v28.8h
+    add         v7.8h,  v7.8h,  v29.8h
+    add         v29.8h, v29.8h, v28.8h
+    st1         {v6.8h,v7.8h},  [x3], #32
+    and         v4.16b, v4.16b, v27.16b
+    and         v5.16b, v5.16b, v27.16b
+    uzp1        v6.8h,  v4.8h,  v5.8h   // x & 31
+    uzp2        v7.8h,  v4.8h,  v5.8h   // y & 31
+    sub         v4.8h,  v26.8h, v6.8h   // 32 - (x & 31)
+    sub         v5.8h,  v26.8h, v7.8h   // 32 - (y & 31)
+    mul         v19.8h, v6.8h,  v7.8h   // idx3weight = y*x;
+    mul         v18.8h, v4.8h,  v7.8h   // idx2weight = y*(32-x);
+    mul         v17.8h, v6.8h,  v5.8h   // idx1weight = (32-y)*x;
+    mul         v16.8h, v4.8h,  v5.8h   // idx0weight = (32-y)*(32-x) ;
+    umull       v6.4s,  v19.4h, v25.4h
+    umull2      v7.4s,  v19.8h, v25.8h
+    umull       v4.4s,  v18.4h, v25.4h
+    umull2      v5.4s,  v18.8h, v25.8h
+    umull       v2.4s,  v17.4h, v25.4h
+    umull2      v3.4s,  v17.8h, v25.8h
+    umull       v0.4s,  v16.4h, v25.4h
+    umull2      v1.4s,  v16.8h, v25.8h
+    rshrn       v19.4h, v6.4s,  #10
+    rshrn2      v19.8h, v7.4s,  #10
+    rshrn       v18.4h, v4.4s,  #10
+    rshrn2      v18.8h, v5.4s,  #10
+    rshrn       v17.4h, v2.4s,  #10
+    rshrn2      v17.8h, v3.4s,  #10
+    rshrn       v16.4h, v0.4s,  #10
+    rshrn2      v16.8h, v1.4s,  #10
+    zip1        v0.8h,  v16.8h, v17.8h
+    zip2        v1.8h,  v16.8h, v17.8h
+    zip1        v2.8h,  v18.8h, v19.8h
+    zip2        v3.8h,  v18.8h, v19.8h
+    st1         {v0.8h,v1.8h},  [x3], #32
+    st1         {v2.8h,v3.8h},  [x3], #32
+    b.ge        8b
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index f40fed62..96582d45 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -96,6 +96,8 @@ void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
 void integral_init8v_neon( uint16_t *, intptr_t );
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *,
                                        intptr_t, intptr_t, int, int );
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -201,6 +203,89 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             int height, int16_t *buf );
 #endif // !HIGH_BIT_DEPTH
 
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+
+void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
+                                               int16_t *propagate_amount,
+                                               uint16_t *lowres_costs,
+                                               int16_t *output,
+                                               int bipred_weight, int mb_y,
+                                               int len );
+
+static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
+                                             int16_t (*mvs)[2],
+                                             int16_t *propagate_amount,
+                                             uint16_t *lowres_costs,
+                                             int bipred_weight, int mb_y,
+                                             int len, int list )
+{
+    int16_t *current = h->scratch_buffer2;
+
+    x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
+                                              lowres_costs, current,
+                                              bipred_weight, mb_y, len );
+
+    unsigned stride = h->mb.i_mb_stride;
+    unsigned width = h->mb.i_mb_width;
+    unsigned height = h->mb.i_mb_height;
+
+    for( unsigned i = 0; i < len; current += 32 )
+    {
+        int end = X264_MIN( i+8, len );
+        for( ; i < end; i++, current += 2 )
+        {
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
+                continue;
+
+            unsigned mbx = current[0];
+            unsigned mby = current[1];
+            unsigned idx0 = mbx + mby * stride;
+            unsigned idx2 = idx0 + stride;
+
+            /* Shortcut for the simple/common case of zero MV */
+            if( !M32( mvs[i] ) )
+            {
+                CLIP_ADD( ref_costs[idx0], current[16] );
+                continue;
+            }
+
+            if( mbx < width-1 && mby < height-1 )
+            {
+                CLIP_ADD2( ref_costs+idx0, current+16 );
+                CLIP_ADD2( ref_costs+idx2, current+32 );
+            }
+            else
+            {
+                /* Note: this takes advantage of unsigned representation to
+                 * catch negative mbx/mby. */
+                if( mby < height )
+                {
+                    if( mbx < width )
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );
+                    if( mbx+1 < width )
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );
+                }
+                if( mby+1 < height )
+                {
+                    if( mbx < width )
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );
+                    if( mbx+1 < width )
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );
+                }
+            }
+        }
+    }
+}
+
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
 void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 {
 #if !HIGH_BIT_DEPTH
@@ -252,5 +337,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8h = integral_init8h_neon;
     pf->integral_init4v = integral_init4v_neon;
     pf->integral_init8v = integral_init8v_neon;
+
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
 #endif // !HIGH_BIT_DEPTH
 }
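
For reference, a rough scalar C sketch of the per-macroblock computation that
x264_mbtree_propagate_cost_neon vectorizes, reconstructed from the register comments in
the assembly above (propagate_intra, propagate_num, propagate_denom, propagate_amount).
The function and argument names and the MIN macro are illustrative only, and the plain
float divide stands in for the frecpe/frecps reciprocal estimate and the fcvtns/sqxtn
rounding and saturation used by the NEON code:

#include <stdint.h>

#define MIN(a,b) ((a) < (b) ? (a) : (b))

/* dst, propagate_in, intra, inter and inv_qscale mirror x0..x4 above,
 * fps_factor mirrors *x5 and len the counter decremented in w6 */
static void mbtree_propagate_cost_scalar( int16_t *dst, const uint16_t *propagate_in,
                                          const uint16_t *intra, const uint16_t *inter,
                                          const uint16_t *inv_qscale,
                                          const float *fps_factor, int len )
{
    float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        int intra_cost = intra[i];
        int inter_cost = MIN( intra_cost, inter[i] & 0x3fff );  /* bic #0xc0, lsl #8 + umin */
        float propagate_intra  = (float)intra_cost * inv_qscale[i];       /* umull          */
        float propagate_amount = propagate_in[i] + propagate_intra * fps; /* fmla with *x5  */
        float propagate_num    = intra_cost - inter_cost;                 /* usubl          */
        float propagate_denom  = intra_cost;                   /* uxtl, assumed nonzero     */
        float cost = propagate_amount * propagate_num / propagate_denom;  /* fmul + recip   */
        if( cost > 32767.0f )
            cost = 32767.0f;                       /* sqxtn saturates the result to int16   */
        dst[i] = (int16_t)( cost + 0.5f );         /* fcvtns: convert with round-to-nearest */
    }
}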