granicus.if.org Git - libx264/commitdiff
ARM: update NEON mc_chroma to work with NV12 and re-enable it
author     Stefan Groenroos <stefan.gronroos@gmail.com>
           Mon, 25 Feb 2013 21:43:09 +0000 (23:43 +0200)
committer  Fiona Glaser <fiona@x264.com>
           Tue, 26 Feb 2013 23:13:17 +0000 (15:13 -0800)
Up to 10-15% faster overall.
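
Background for the hunks below: x264's internal chroma plane is NV12, so mc_chroma reads interleaved UV source data and writes U and V to two separate destinations. The prototype comment kept in the assembly hunk still shows the old single-destination signature; the rewritten routine actually takes the U and V destination pointers in r0/r1 (the stores alternate between [r0] and [r1]), the destination stride in r2, and the interleaved source in r3 with its stride loaded into r4 from the stack. As a rough scalar sketch of what the routine computes (parameter names illustrative, not the code x264 ships):

#include <stdint.h>

/* Scalar sketch of NV12 bilinear chroma MC with the usual H.264 weights
 * cA..cD derived from the eighth-pel mv fractions. Illustrative only. */
static void mc_chroma_nv12_ref( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                uint8_t *src,  intptr_t i_src_stride,
                                int mvx, int mvy, int i_width, int i_height )
{
    int dx = mvx & 7, dy = mvy & 7;
    int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;

    /* Step to the top-left UV pair; x moves in 2-byte steps because U and V
     * are interleaved, which is what the "bic r3, r3, #0x1" below guards. */
    src += (mvy >> 3) * i_src_stride + (mvx >> 3) * 2;

    for( int y = 0; y < i_height; y++ )
    {
        const uint8_t *s0 = src, *s1 = src + i_src_stride;
        for( int x = 0; x < i_width; x++ )
        {
            dstu[x] = ( cA*s0[2*x]   + cB*s0[2*x+2] +
                        cC*s1[2*x]   + cD*s1[2*x+2] + 32 ) >> 6;
            dstv[x] = ( cA*s0[2*x+1] + cB*s0[2*x+3] +
                        cC*s1[2*x+1] + cD*s1[2*x+3] + 32 ) >> 6;
        }
        dstu += i_dst_stride;
        dstv += i_dst_stride;
        src  += i_src_stride;
    }
}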

common/arm/mc-a.S
common/arm/mc-c.c

index d033449a2257e336af0c115e91c5e31ff4bbe999..8a37e95742fd1a367b58841eb05a24c39bb15cb2 100644 (file)
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42@gmail.com>
  *          Mans Rullgard <mans@mansr.com>
+ *          Stefan Groenroos <stefan.gronroos@gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -813,54 +814,57 @@ copy_w16_aligned_loop:
 // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
 //                           uint8_t *src, intptr_t i_src_stride,
 //                           int dx, int dy, int i_width, int i_height );
+
 function x264_mc_chroma_neon
-    push            {r4-r6, lr}
-    ldrd            r4,  [sp, #16]
-    ldr             r6,  [sp, #24]
+    push            {r4-r8, lr}
+    vpush           {d8-d11}
+    ldrd            r4, [sp, #56]
+    ldrd            r6, [sp, #64]
 
-    asr             lr,  r5,  #3
-    mul             lr,  r3,  lr
-    add             r2,  r2,  r4,  asr #3
-    cmp             r6, #4
-    add             r2,  r2,  lr
+    asr             lr, r6, #3
+    mul             lr, r4, lr
+    add             r3, r3, r5, asr #2
+    cmp             r7, #4
 
-    and             r4, r4, #7
     and             r5, r5, #7
-    pld             [r2]
-    pld             [r2, r3]
+    and             r6, r6, #7
+
+    add             r3, r3, lr
+    bic             r3, r3, #0x1
+
+    pld             [r3]
+    pld             [r3, r4]
 
     bgt             mc_chroma_w8
     beq             mc_chroma_w4
 
-// calculate cA cB cC cD
-.macro CHROMA_MC_START r0 r1
-    muls            lr,  r4,  r5
-    rsb             r6,  lr,  r5,  lsl #3
-    rsb             ip,  lr,  r4,  lsl #3
-    sub             r4,  lr,  r4,  lsl #3
-    sub             r4,  r4,  r5,  lsl #3
-    add             r4,  r4,  #64
+.macro CHROMA_MC_START r00, r01, r10, r11
+    muls            lr, r5, r6
+    rsb             r7, lr, r6, lsl #3
+    rsb             ip, lr, r5, lsl #3
+    sub             r5, lr, r5, lsl #3
+    sub             r5, r5, r6, lsl #3
+    add             r5, r5, #64
 
     beq             2f
+    vld2.8          {\r00-\r01}, [r3], r4
 
-    add             r5,  r2,  r3
+    vdup.8          d0,    r5
+    vdup.8          d1,    ip
 
-    vdup.8          d0,  r4
-    lsl             r3,  r3,  #1
-    vdup.8          d1,  ip
-    vld1.64         {\r0}, [r2], r3
-    vdup.8          d2,  r6
-    vld1.64         {\r1}, [r5], r3
-    vdup.8          d3,  lr
-    ldr             r4,  [sp, #28]
-
-    vext.8          d5,  d4,  d5,  #1
-    vext.8          d7,  d6,  d7,  #1
+    vdup.8          d2,    r7
+    vld2.8          {\r10-\r11}, [r3], r4
+    vdup.8          d3,    lr
+    ldr             r5,    [sp, #72]
 .endm
 
 .macro CHROMA_MC width, align
 mc_chroma_w\width:
-    CHROMA_MC_START d4,  d6
+    CHROMA_MC_START d4, d5,  d8, d9
+    vext.8          d6,  d4,  d6,  #1
+    vext.8          d7,  d5,  d7,  #1
+    vext.8          d10, d8,  d10, #1
+    vext.8          d11, d9,  d11, #1
 // since the element size varies, there's a different index for the 2nd store
 .if \width == 4
     .set st2, 1
@@ -868,187 +872,292 @@ mc_chroma_w\width:
     .set st2, 2
 .endif
 
-    vtrn.32         d4,  d5
-    vtrn.32         d6,  d7
+    vtrn.32         d4, d6
+    vtrn.32         d5, d7
+    vtrn.32         d8, d10
+    vtrn.32         d9, d11
 
-    vtrn.32         d0,  d1
-    vtrn.32         d2,  d3
+    vtrn.32         d0, d1
+    vtrn.32         d2, d3
 
 1:  // height loop, interpolate xy
-    pld             [r5]
+
     vmull.u8        q8,  d4,  d0
-    vmlal.u8        q8,  d6,  d2
-    vld1.64         {d4},     [r2], r3
-    vext.8          d5,  d4,  d5,  #1
-    vtrn.32         d4,  d5
-    vmull.u8        q9,  d6,  d0
-    vmlal.u8        q9,  d4,  d2
-    vld1.64         {d6},     [r5], r3
+    vmlal.u8        q8,  d8,  d2
+    vmull.u8        q9,  d5,  d0
+    vmlal.u8        q9,  d9,  d2
+
+    vld2.8          {d4-d5},  [r3], r4
+
+    vext.8          d6,  d4,  d6,  #1
+    vext.8          d7,  d5,  d7,  #1
+
     vadd.i16        d16, d16, d17
     vadd.i16        d17, d18, d19
+
+    vtrn.32         d4,  d6
+    vtrn.32         d5,  d7
+
+    vmull.u8        q10, d8,  d0
+    vmlal.u8        q10, d4,  d2
+    vmull.u8        q11, d9,  d0
+    vmlal.u8        q11, d5,  d2
+
+    vld2.8          {d8-d9},  [r3], r4
+
     vrshrn.u16      d16, q8,  #6
-    subs            r4,  r4,  #2
-    pld             [r2]
-    vext.8          d7,  d6,  d7,  #1
-    vtrn.32         d6,  d7
-    vst1.\align     {d16[0]},   [r0,:\align], r1
-    vst1.\align     {d16[st2]}, [r0,:\align], r1
+
+    vext.8          d10, d8,  d10,  #1
+    vext.8          d11, d9,  d11,  #1
+
+    vadd.i16        d18, d20, d21
+    vadd.i16        d19, d22, d23
+
+    vtrn.32         d8, d10
+    vtrn.32         d9, d11
+
+    vrshrn.u16      d18, q9,  #6
+
+    subs            r5,  r5,  #2
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.\align     {d16[0]},   [r0,:\align], r2
+    vst1.\align     {d16[st2]}, [r1,:\align], r2
+    vst1.\align     {d18[0]},   [r0,:\align], r2
+    vst1.\align     {d18[st2]}, [r1,:\align], r2
     bgt             1b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
 
 2:  // dx or dy are 0
-    tst             r6,  r6
-    add             ip,  ip,  r6
-    vdup.8          d0,  r4
+    tst             r7,  r7
+    add             ip,  ip,  r7
+    vdup.8          d0,  r5
+    ldr             r5,  [sp, #72]
     vdup.8          d1,  ip
-    vtrn.32         d0,  d1
-    ldr             r4,  [sp, #28]
 
     beq             4f
 
-    vext.32         d1,  d0,  d1,  #1
-    add             r5,  r2,  r3
-    lsl             r3,  r3,  #1
-    vld1.32         {d4[0]},  [r2], r3
-    vld1.32         {d4[1]},  [r5], r3
+    vld1.64          {d4}, [r3], r4
+    vld1.64          {d6}, [r3], r4
 
 3:  // vertical interpolation loop
-    pld             [r5]
+
     vmull.u8        q8,  d4,  d0
-    vld1.32         {d4[0]},  [r2], r3
-    vmull.u8        q9,  d4,  d1
-    vld1.32         {d4[1]},  [r5], r3
-    vadd.i16        d16, d16, d17
-    vadd.i16        d17, d18, d19
-    vrshrn.u16      d16, q8,  #6
-    subs            r4,  r4,  #2
-    pld             [r2]
-    vst1.\align     {d16[0]},   [r0,:\align], r1
-    vst1.\align     {d16[st2]}, [r0,:\align], r1
+    vmlal.u8        q8,  d6,  d1
+    vmull.u8        q9,  d6,  d0
+    vld1.64         {d4}, [r3], r4
+    vmlal.u8        q9,  d4,  d1
+    vld1.64         {d6}, [r3], r4
+
+    vrshrn.u16      d16, q8,  #6 // uvuvuvuv
+    vrshrn.u16      d17, q9,  #6 // uvuvuvuv
+    subs            r5,  r5,  #2
+    vuzp.8          d16, d17 // d16=uuuu|uuuu, d17=vvvv|vvvv
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.\align     {d16[0]},   [r0,:\align], r2
+    vst1.\align     {d16[st2]}, [r0,:\align], r2
+    vst1.\align     {d17[0]},   [r1,:\align], r2
+    vst1.\align     {d17[st2]}, [r1,:\align], r2
     bgt             3b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
 
 4:  // dy is 0
-    vld1.64         {d4},     [r2], r3
-    vld1.64         {d6},     [r2], r3
-    vext.8          d5,  d4,  d5,  #1
-    vext.8          d7,  d6,  d7,  #1
-    vtrn.32         d4,  d5
-    vtrn.32         d6,  d7
+
+    vld1.64         {d4-d5},  [r3], r4
+    vld1.64         {d6-d7},  [r3], r4
+
+    vext.8          d5,  d4,  d5,  #2
+    vext.8          d7,  d6,  d7,  #2
 
 5:  // horizontal interpolation loop
+
     vmull.u8        q8,  d4,  d0
+    vmlal.u8        q8,  d5,  d1
     vmull.u8        q9,  d6,  d0
-    subs            r4,  r4,  #2
-    vld1.64         {d4},     [r2], r3
-    vext.8          d5,  d4,  d5,  #1
-    vtrn.32         d4,  d5
-    vadd.i16        d16, d16, d17
-    vadd.i16        d17, d18, d19
-    pld             [r2]
+    vmlal.u8        q9,  d7,  d1
+
+    subs            r5,  r5,  #2
+    vld1.64         {d4-d5},  [r3], r4
+    vld1.64         {d6-d7},  [r3], r4
+    vext.8          d5,  d4,  d5,  #2
     vrshrn.u16      d16, q8,  #6
-    vld1.64         {d6},     [r2], r3
-    vext.8          d7,  d6,  d7,  #1
-    vtrn.32         d6,  d7
-    pld             [r2]
-    vst1.\align     {d16[0]},   [r0,:\align], r1
-    vst1.\align     {d16[st2]}, [r0,:\align], r1
+    vrshrn.u16      d17, q9,  #6
+    vext.8          d7,  d6,  d7,  #2
+    vuzp.8          d16, d17
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.\align     {d16[0]},   [r0,:\align], r2
+    vst1.\align     {d16[st2]}, [r0,:\align], r2
+    vst1.\align     {d17[0]},   [r1,:\align], r2
+    vst1.\align     {d17[st2]}, [r1,:\align], r2
     bgt             5b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
 .endm
 
-    CHROMA_MC 2, 16
-    CHROMA_MC 4, 32
+   CHROMA_MC 2, 16
+   CHROMA_MC 4, 32
 
-// the optimial timing for width 8 is different enough that it's not
-// readable to put it in the same macro as width 2/4
 mc_chroma_w8:
-    CHROMA_MC_START d4-d5, d6-d7
+    CHROMA_MC_START d4, d7, d8, d11
+    vext.8          d5,  d4,  d5,  #1
+    vext.8          d9,  d8,  d9,  #1
+    vext.8          d7,  d6,  d7,  #1
+    vext.8          d11, d10, d11,  #1
 
 1:  // height loop, interpolate xy
-    pld             [r5]
     vmull.u8        q8,  d4,  d0
     vmlal.u8        q8,  d5,  d1
-    vld1.64         {d4, d5}, [r2], r3
-    vmlal.u8        q8,  d6,  d2
-    vext.8          d5,  d4,  d5,  #1
-    vmlal.u8        q8,  d7,  d3
+    vmlal.u8        q8,  d8,  d2
+    vmlal.u8        q8,  d9,  d3
+
     vmull.u8        q9,  d6,  d0
-    subs            r4,  r4,  #2
     vmlal.u8        q9,  d7,  d1
-    vmlal.u8        q9,  d4,  d2
-    vmlal.u8        q9,  d5,  d3
+    vmlal.u8        q9,  d10,  d2
+    vmlal.u8        q9,  d11,  d3
+
+    vld2.8          {d4-d7}, [r3], r4
+
+    vext.8          d5,  d4,  d5,  #1
+    vext.8          d7,  d6,  d7,  #1
+
+    vmull.u8        q10, d8,   d0
+    vmlal.u8        q10, d9,   d1
+    vmlal.u8        q10, d4,   d2
+    vmlal.u8        q10, d5,   d3
+
+    vmull.u8        q11, d10,  d0
+    vmlal.u8        q11, d11,  d1
+    vmlal.u8        q11, d6,   d2
+    vmlal.u8        q11, d7,   d3
+
+    subs            r5,  r5,   #2
+    vld2.8          {d8-d11}, [r3], r4
+
     vrshrn.u16      d16, q8,  #6
-    vld1.64         {d6, d7}, [r5], r3
-    pld             [r2]
     vrshrn.u16      d17, q9,  #6
-    vext.8          d7,  d6,  d7,  #1
-    vst1.64         {d16}, [r0,:64], r1
-    vst1.64         {d17}, [r0,:64], r1
+    vrshrn.u16      d18, q10, #6
+    vext.8          d9,  d8,  d9,  #1
+    vrshrn.u16      d19, q11, #6
+    vext.8          d11, d10, d11,  #1
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.64         {d16}, [r0,:64], r2
+    vst1.64         {d17}, [r1,:64], r2
+    vst1.64         {d18}, [r0,:64], r2
+    vst1.64         {d19}, [r1,:64], r2
+
     bgt             1b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
 
 2:  // dx or dy are 0
-    tst             r6,  r6
-    add             ip,  ip,  r6
-    vdup.8          d0,  r4
+    tst             r7,  r7
+    add             ip,  ip,  r7
+    vdup.8          d0,  r5
+    ldr             r5,  [sp, #72]
     vdup.8          d1,  ip
-    ldr             r4,  [sp, #28]
 
     beq             4f
 
-    add             r5,  r2,  r3
-    lsl             r3,  r3,  #1
-    vld1.64         {d4}, [r2], r3
-    vld1.64         {d6}, [r5], r3
+    vld2.8          {d4-d5}, [r3], r4
+    vld2.8          {d6-d7}, [r3], r4
 
 3:  // vertical interpolation loop
-    pld             [r5]
-    vmull.u8        q8,  d4,  d0
+    vmull.u8        q8,  d4,  d0 //U
     vmlal.u8        q8,  d6,  d1
-    vld1.64         {d4}, [r2], r3
-    vmull.u8        q9,  d6,  d0
-    vmlal.u8        q9,  d4,  d1
-    vld1.64         {d6}, [r5], r3
+    vmull.u8        q9,  d5,  d0 //V
+    vmlal.u8        q9,  d7,  d1
+
+    vld2.8          {d4-d5}, [r3], r4
+
+    vmull.u8        q10, d6,  d0
+    vmlal.u8        q10, d4,  d1
+    vmull.u8        q11, d7,  d0
+    vmlal.u8        q11, d5,  d1
+
+    vld2.8          {d6-d7}, [r3], r4
+
     vrshrn.u16      d16, q8,  #6
     vrshrn.u16      d17, q9,  #6
-    subs            r4,  r4,  #2
-    pld             [r2]
-    vst1.64         {d16}, [r0,:64], r1
-    vst1.64         {d17}, [r0,:64], r1
+    vrshrn.u16      d18, q10, #6
+    vrshrn.u16      d19, q11, #6
+    subs            r5,  r5,  #2
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.64         {d16}, [r0,:64], r2
+    vst1.64         {d17}, [r1,:64], r2
+    vst1.64         {d18}, [r0,:64], r2
+    vst1.64         {d19}, [r1,:64], r2
+
     bgt             3b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
 
 4:  // dy is 0
-    vld1.64         {d4, d5}, [r2], r3
-    vld1.64         {d6, d7}, [r2], r3
+
+    vld2.8          {d4-d7},  [r3], r4
+    vld2.8          {d8-d11}, [r3], r4
     vext.8          d5,  d4,  d5,  #1
     vext.8          d7,  d6,  d7,  #1
+    vext.8          d9,  d8,  d9,  #1
+    vext.8          d11, d10, d11, #1
 
 5:  // horizontal interpolation loop
-    pld             [r2]
-    subs            r4,  r4,  #2
-    vmull.u8        q8,  d4,  d0
+    subs            r5,  r5,  #2
+    vmull.u8        q8,  d4,  d0 //U
     vmlal.u8        q8,  d5,  d1
-    vld1.64         {d4,  d5}, [r2], r3
-    vmull.u8        q9,  d6,  d0
+    vmull.u8        q9,  d6,  d0 //V
     vmlal.u8        q9,  d7,  d1
-    pld             [r2]
+
+    vld2.8          {d4-d7}, [r3], r4
+
+    vmull.u8        q10, d8,  d0
+    vmlal.u8        q10, d9,  d1
+    vmull.u8        q11, d10, d0
+    vmlal.u8        q11, d11, d1
+
+    vld2.8          {d8-d11}, [r3], r4
+
     vext.8          d5,  d4,  d5,  #1
     vrshrn.u16      d16, q8,  #6
-    vrshrn.u16      d17, q9,  #6
-    vld1.64         {d6, d7}, [r2], r3
     vext.8          d7,  d6,  d7,  #1
-    vst1.64         {d16}, [r0,:64], r1
-    vst1.64         {d17}, [r0,:64], r1
+    vrshrn.u16      d17, q9,  #6
+    vext.8          d9,  d8,  d9,  #1
+    vrshrn.u16      d18, q10, #6
+    vext.8          d11, d10, d11, #1
+    vrshrn.u16      d19, q11, #6
+
+    pld             [r3]
+    pld             [r3, r4]
+
+    vst1.64         {d16}, [r0,:64], r2
+    vst1.64         {d17}, [r1,:64], r2
+    vst1.64         {d18}, [r0,:64], r2
+    vst1.64         {d19}, [r1,:64], r2
     bgt             5b
 
-    pop             {r4-r6, pc}
+    vpop            {d8-d11}
+    pop             {r4-r8, pc}
+
 .endfunc
 
 
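The NV12 handling above boils down to two NEON idioms. In the width-8 and full-xy paths, vld2.8 deinterleaves the source as it is loaded, so U bytes and V bytes land in separate d registers and can be filtered independently. In the narrow dx-only/dy-only paths the samples stay interleaved (note the vext.8 ... #2, which steps over a whole UV pair), and vuzp.8 unzips the narrowed result just before the per-plane stores to [r0] and [r1]. The vpush/vpop of d8-d11 is required because those registers are callee-saved under the ARM procedure call standard. The intrinsic snippets below illustrate the two idioms; they are not taken from x264.

#include <arm_neon.h>
#include <stdint.h>

/* vld2.8 {d4-d5}, [r3]: even bytes (U) go to one register, odd bytes (V)
 * to the other, giving separate 8-sample U and V rows from NV12 input. */
void split_uv_row( const uint8_t *src_uv, uint8_t *u, uint8_t *v )
{
    uint8x8x2_t uv = vld2_u8( src_uv );
    vst1_u8( u, uv.val[0] );
    vst1_u8( v, uv.val[1] );
}

/* vuzp.8 d16, d17: unzips two registers that still hold uvuv... pairs after
 * the narrowing shift, yielding all-U in .val[0] and all-V in .val[1]. */
uint8x8x2_t unzip_uv( uint8x8_t lo, uint8x8_t hi )
{
    return vuzp_u8( lo, hi );
}
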
index d69514cd2cf70d71987b74b7e1eb4064368ded45..2168120f711a00e6355577959687e24a3c11d16f 100644 (file)
@@ -238,7 +238,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->offsetsub = x264_mc_offsetsub_wtab_neon;
     pf->weight_cache = x264_weight_cache_neon;
 
-//  pf->mc_chroma = x264_mc_chroma_neon;
+    pf->mc_chroma = x264_mc_chroma_neon;
     pf->mc_luma = mc_luma_neon;
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = hpel_filter_neon;
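
On the C side the change is only re-enabling the assignment that had been commented out while the NEON routine still assumed planar chroma. For that assignment to type-check, the x264_mc_chroma_neon declaration in mc-c.c has to carry the post-NV12 signature implied by the register and stack usage above, roughly (parameter names illustrative):

#include <stdint.h>

void x264_mc_chroma_neon( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                          uint8_t *src,  intptr_t i_src_stride,
                          int mvx, int mvy, int i_width, int i_height );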