From: Martin Storsjö
Date: Thu, 13 Aug 2015 20:59:24 +0000 (+0300)
Subject: arm: Simplify x264_predict_8x8c_p_neon
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3f89a6bbee061cb0361770cf5b8495448515a011;p=libx264

arm: Simplify x264_predict_8x8c_p_neon

This gets rid of a few unnecessary (and confusing) steps in
calculating the increment to i00.

checkasm timing            Cortex-A7      A8      A9
intra_predict_8x8c_p_c          5525    4732    4755
intra_predict_8x8c_p_neon       1719    1140    1262   (before)
intra_predict_8x8c_p_neon       1663    1142    1255   (after)
---

diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 3343144b..7e5d9d33 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -535,17 +535,12 @@ function x264_predict_8x8c_p_neon
     vadd.i16    d16, d16, d0
     vshl.i16    d2, d16, #4
     vsub.i16    d2, d2, d3
-    vshl.i16    d3, d4, #3
     vext.16     q0, q0, q0, #7
-    vsub.i16    d6, d5, d3
     vmov.16     d0[0], r3
     vmul.i16    q0, q0, d4[0]
     vdup.16     q1, d2[0]
-    vdup.16     q2, d4[0]
-    vdup.16     q3, d6[0]
-    vshl.i16    q2, q2, #3
+    vdup.16     q3, d5[0]
     vadd.i16    q1, q1, q0
-    vadd.i16    q3, q3, q2
     mov         r3, #8
 1:
     vqshrun.s16 d0, q1, #5