From: Martin Storsjö <martin@martin.st>
Date: Thu, 13 Aug 2015 20:59:24 +0000 (+0300)
Subject: arm: Simplify x264_predict_8x8c_p_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3f89a6bbee061cb0361770cf5b8495448515a011;p=libx264

arm: Simplify x264_predict_8x8c_p_neon

This gets rid of a few unnecessary (and confusing) steps in
calculating the increment to i00.

checkasm timing      Cortex-A7    A8    A9
intra_predict_8x8c_p_c      5525  4732  4755
intra_predict_8x8c_p_neon   1719  1140  1262  (before)
intra_predict_8x8c_p_neon   1663  1142  1255  (after)
---

diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 3343144b..7e5d9d33 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -535,17 +535,12 @@ function x264_predict_8x8c_p_neon
     vadd.i16    d16, d16, d0
     vshl.i16    d2,  d16, #4
     vsub.i16    d2,  d2,  d3
-    vshl.i16    d3,  d4,  #3
     vext.16     q0,  q0,  q0,  #7
-    vsub.i16    d6,  d5,  d3
     vmov.16     d0[0], r3
     vmul.i16    q0,  q0,  d4[0]
     vdup.16     q1,  d2[0]
-    vdup.16     q2,  d4[0]
-    vdup.16     q3,  d6[0]
-    vshl.i16    q2,  q2,  #3
+    vdup.16     q3,  d5[0]
     vadd.i16    q1,  q1,  q0
-    vadd.i16    q3,  q3,  q2
     mov         r3,  #8
 1:
     vqshrun.s16 d0,  q1,  #5