From b8d7b8acb48b45afbfd7efb5baac79475682684a Mon Sep 17 00:00:00 2001
From: George Stephanos
Date: Thu, 1 Dec 2011 16:53:45 -0800
Subject: [PATCH] More ARM NEON assembly functions

predict_8x8_v, predict_4x4_dc_top, predict_8x8_ddl, predict_8x8_ddr, predict_8x8_vl, predict_8x8_vr, predict_8x8_hd, predict_8x8_hu.
From Google Code-In.
---
 common/arm/predict-a.S | 213 ++++++++++++++++++++++++++++++++++++++++-
 common/arm/predict-c.c |  16 ++++
 2 files changed, 228 insertions(+), 1 deletion(-)

diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 574653ea..3e9ed61c 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -102,6 +102,21 @@ function x264_predict_4x4_dc_armv6
     bx          lr
 .endfunc
 
+function x264_predict_4x4_dc_top_neon    // fills a 4x4 block with the rounded mean of the 4 bytes above it; r0 = src (first argument)
+    mov         r12, #FDEC_STRIDE        // r12 = row stride consumed by the post-incrementing stores
+    sub         r1, r0, #FDEC_STRIDE     // r1 = address of the row directly above the block
+    vld1.32     d1[], [r1,:32]           // load those 4 bytes and replicate them into both halves of d1
+    vpaddl.u8   d1, d1                   // pairwise byte sums, widened to u16 lanes
+    vpadd.u16   d1, d1, d1               // every u16 lane now holds the sum of the 4 loaded bytes
+    vrshr.u16   d1, d1, #2               // rounding shift: (sum + 2) >> 2
+    vdup.8      d1, d1[0]                // broadcast the low byte of the result to all byte lanes
+    vst1.32     d1[0], [r0,:32], r12     // store 4 bytes per row, then advance r0 by one row (x4)
+    vst1.32     d1[0], [r0,:32], r12
+    vst1.32     d1[0], [r0,:32], r12
+    vst1.32     d1[0], [r0,:32], r12
+    bx          lr
+.endfunc
+
 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
     uhadd8      \a1, \a1, \c1
@@ -211,6 +226,202 @@ function x264_predict_8x8_h_neon
     bx          lr
 .endfunc
 
+function x264_predict_8x8_v_neon         // copies 8 bytes from edge+16 into each of 8 rows; r0 = src, r1 = edge
+    add         r1, r1, #16              // r1 = edge + 16 (presumably the top-neighbour row; confirm edge[] layout with the C caller)
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+    vld1.8      {d0}, [r1,:64]           // d0 = 8 bytes at edge[16..23]
+.rept 8
+    vst1.8      {d0}, [r0,:64], r12      // same 8 bytes written to each of the 8 rows
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_ddl_neon       // 3-tap filters edge[16..31], then each row is the filtered vector advanced by one byte; r0 = src, r1 = edge
+    add         r1, #16                  // r1 = edge + 16 (presumably top + top-right neighbours; confirm edge[] layout)
+    vld1.8      {d0, d1}, [r1,:128]      // q0 = 16 bytes edge[16..31]
+    vmov.i8     q3, #0                   // zero vector; supplies the filler byte for the "previous" vector
+    vrev64.8    d2, d1                   // d2[0] = d1[7], i.e. the last loaded byte
+    vext.8      q8, q3, q0, #15          // q8 = { 0, q0[0..14] }: previous neighbour of each lane
+    vext.8      q2, q0, q1, #1           // q2 = { q0[1..15], d2[0] }: next neighbour, last byte repeated
+    vhadd.u8    q8, q2                   // q8 = (prev + next) >> 1
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+    vrhadd.u8   q0, q8                   // q0 = (cur + q8 + 1) >> 1, equal to (prev + 2*cur + next + 2) >> 2
+    vext.8      d2, d0, d1, #1           // row 0 = filtered bytes 1..8 (lane 0, which used the zero filler, is never stored)
+    vext.8      d3, d0, d1, #2           // row 1 = filtered bytes 2..9; following rows keep advancing by one byte
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #3
+    vst1.8      d3, [r0,:64], r12
+    vext.8      d3, d0, d1, #4
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #5
+    vst1.8      d3, [r0,:64], r12
+    vext.8      d3, d0, d1, #6
+    vst1.8      d2, [r0,:64], r12
+    vext.8      d2, d0, d1, #7
+    vst1.8      d3, [r0,:64], r12
+    vst1.8      d2, [r0,:64], r12
+    vst1.8      d1, [r0,:64], r12        // row 7 = filtered bytes 8..15
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_ddr_neon       // 3-tap filters edge[8..23] and writes rows bottom-up, advancing one byte per row; r0 = src, r1 = edge
+    vld1.8      {d0-d3}, [r1,:128]       // q0:q1 = edge[0..31]; the :128 hint requires edge to be 16-byte aligned
+    vext.8      q2, q0, q1, #7           // q2 = edge[7..22]: previous neighbour of edge[8..23]
+    vext.8      q3, q0, q1, #9           // q3 = edge[9..24]: next neighbour of edge[8..23]
+
+    vhadd.u8    q2, q2, q3               // q2 = (prev + next) >> 1
+    vrhadd.u8   d0, d1, d4               // d0 = 3-tap filtered edge[8..15]
+    vrhadd.u8   d1, d2, d5               // d1 = 3-tap filtered edge[16..23]
+
+    add         r0, #7*FDEC_STRIDE       // start at the last row of the block
+    mov         r12, #-1*FDEC_STRIDE     // negative stride: each store steps up one row
+
+    vext.8      d2, d0, d1, #1
+    vst1.8      {d0}, [r0,:64], r12      // row 7 = filtered bytes 0..7 of d0:d1
+    vext.8      d4, d0, d1, #2
+    vst1.8      {d2}, [r0,:64], r12      // row 6 = bytes 1..8; each row above starts one byte later
+    vext.8      d5, d0, d1, #3
+    vst1.8      {d4}, [r0,:64], r12
+    vext.8      d4, d0, d1, #4
+    vst1.8      {d5}, [r0,:64], r12
+    vext.8      d5, d0, d1, #5
+    vst1.8      {d4}, [r0,:64], r12
+    vext.8      d4, d0, d1, #6
+    vst1.8      {d5}, [r0,:64], r12
+    vext.8      d5, d0, d1, #7
+    vst1.8      {d4}, [r0,:64], r12
+    vst1.8      {d5}, [r0,:64], r12      // row 0 = bytes 7..14
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_vl_neon        // even rows use a 2-tap average of edge[16..31], odd rows a 3-tap filter; r0 = src, r1 = edge
+    add         r1, #16                  // r1 = edge + 16 (presumably top + top-right neighbours; confirm edge[] layout)
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+
+    vld1.8      {d0, d1}, [r1,:128]      // q0 = edge[16..31]
+    vext.8      q1, q1, q0, #15          // q1 = { stale q1[15], q0[0..14] }; the stale lane 0 never reaches a stored byte
+    vext.8      q2, q0, q2, #1           // q2 = { q0[1..15], stale q2[0] }; the stale lane 15 never reaches a stored byte
+
+    vrhadd.u8   q3, q0, q2               // q3 = (cur + next + 1) >> 1: 2-tap average
+
+    vhadd.u8    q1, q1, q2               // q1 = (prev + next) >> 1
+    vrhadd.u8   q0, q0, q1               // q0 = (prev + 2*cur + next + 2) >> 2: 3-tap filter
+
+    vext.8      d2, d0, d1, #1           // q1 (d2/d3) is free now and is reused as row scratch
+    vst1.8      {d6}, [r0,:64], r12      // row 0 = 2-tap bytes 0..7
+    vext.8      d3, d6, d7, #1
+    vst1.8      {d2}, [r0,:64], r12      // row 1 = 3-tap bytes 1..8
+    vext.8      d2, d0, d1, #2
+    vst1.8      {d3}, [r0,:64], r12      // row 2 = 2-tap bytes 1..8
+    vext.8      d3, d6, d7, #2
+    vst1.8      {d2}, [r0,:64], r12      // row 3 = 3-tap bytes 2..9
+    vext.8      d2, d0, d1, #3
+    vst1.8      {d3}, [r0,:64], r12      // row 4 = 2-tap bytes 2..9
+    vext.8      d3, d6, d7, #3
+    vst1.8      {d2}, [r0,:64], r12      // row 5 = 3-tap bytes 3..10
+    vext.8      d2, d0, d1, #4
+    vst1.8      {d3}, [r0,:64], r12      // row 6 = 2-tap bytes 3..10
+    vst1.8      {d2}, [r0,:64], r12      // row 7 = 3-tap bytes 4..11
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_vr_neon        // builds rows from 2-tap and 3-tap filtered edge[8..23]; r0 = src, r1 = edge
+    add         r1, #8                   // r1 = edge + 8
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+    vld1.8      {d4,d5}, [r1,:64]        // q2 = edge[8..23] (presumably left, top-left and top neighbours; confirm edge[] layout)
+
+    vext.8      q1, q2, q2, #14          // q1[i] = q2[i-2] (rotation of q2 by two bytes)
+    vext.8      q0, q2, q2, #15          // q0[i] = q2[i-1] (rotation of q2 by one byte)
+
+    vhadd.u8    q3, q2, q1               // q3 = (q2[i] + q2[i-2]) >> 1
+    vrhadd.u8   q2, q2, q0               // q2 = (q2[i] + q2[i-1] + 1) >> 1: 2-tap average
+    vrhadd.u8   q0, q0, q3               // q0 = 3-tap filter centred on q2[i-1]
+
+    vmov        d2, d0                   // copy the low half of the 3-tap result so vuzp can split it
+
+    vst1.8      {d5}, [r0,:64], r12      // row 0 = high half of the 2-tap result
+    vuzp.8      d2, d0                   // d2 = even lanes, d0 = odd lanes of the low 3-tap half (each repeated twice)
+    vst1.8      {d1}, [r0,:64], r12      // row 1 = high half of the 3-tap result
+    vext.8      d6, d0, d5, #7           // d6 = { d0[7], d5[0..6] }
+    vext.8      d3, d2, d1, #7           // d3 = { d2[7], d1[0..6] }
+    vst1.8      {d6}, [r0,:64], r12      // row 2
+    vst1.8      {d3}, [r0,:64], r12      // row 3
+    vext.8      d6, d0, d5, #6           // d6 = { d0[6..7], d5[0..5] }
+    vext.8      d3, d2, d1, #6           // d3 = { d2[6..7], d1[0..5] }
+    vst1.8      {d6}, [r0,:64], r12      // row 4
+    vst1.8      {d3}, [r0,:64], r12      // row 5
+    vext.8      d6, d0, d5, #5           // d6 = { d0[5..7], d5[0..4] }
+    vext.8      d3, d2, d1, #5           // d3 = { d2[5..7], d1[0..4] }
+    vst1.8      {d6}, [r0,:64], r12      // row 6
+    vst1.8      {d3}, [r0,:64], r12      // row 7
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_hd_neon        // interleaves 2-tap and 3-tap filtered edge[7..22], each row offset by two bytes; r0 = src, r1 = edge
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+    add         r1, #7                   // r1 = edge + 7 (unaligned, hence no alignment hint on the load)
+
+    vld1.8      {d2,d3}, [r1]            // q1 = edge[7..22] (presumably left, top-left and top neighbours; confirm edge[] layout)
+    vext.8      q3, q1, q1, #1           // q3[i] = q1[i+1] (rotation of q1 by one byte)
+    vext.8      q2, q1, q1, #2           // q2[i] = q1[i+2] (rotation of q1 by two bytes)
+
+    vrhadd.u8   q8, q1, q3               // q8 = (q1[i] + q1[i+1] + 1) >> 1: 2-tap average
+
+    vhadd.u8    q1, q2                   // q1 = (q1[i] + q1[i+2]) >> 1
+    vrhadd.u8   q0, q1, q3               // q0 = 3-tap filter centred on q1[i+1]
+
+    vzip.8      d16, d0                  // interleave low halves: d16:d0 = { 2tap[0], 3tap[0], 2tap[1], 3tap[1], ... }
+
+    vext.8      d2, d0, d1, #6           // q1 is free now; d2/d3 are reused as row scratch
+    vext.8      d3, d0, d1, #4
+    vst1.8      {d2}, [r0,:64], r12      // row 0 = { d0[6..7], d1[0..5] }
+    vext.8      d2, d0, d1, #2
+    vst1.8      {d3}, [r0,:64], r12      // row 1 = { d0[4..7], d1[0..3] }
+    vst1.8      {d2}, [r0,:64], r12      // row 2 = { d0[2..7], d1[0..1] }
+    vext.8      d2, d16, d0, #6
+    vst1.8      {d0}, [r0,:64], r12      // row 3 = d0
+    vext.8      d3, d16, d0, #4
+    vst1.8      {d2}, [r0,:64], r12      // row 4 = { d16[6..7], d0[0..5] }
+    vext.8      d2, d16, d0, #2
+    vst1.8      {d3}, [r0,:64], r12      // row 5 = { d16[4..7], d0[0..3] }
+    vst1.8      {d2}, [r0,:64], r12      // row 6 = { d16[2..7], d0[0..1] }
+    vst1.8      {d16}, [r0,:64], r12     // row 7 = d16
+
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_hu_neon        // interleaves 2-tap and 3-tap filtered, byte-reversed edge[7..14], padded with the last pair; r0 = src, r1 = edge
+    mov         r12, #FDEC_STRIDE        // r12 = row stride
+    add         r1, #7                   // r1 = edge + 7
+    vld1.8      {d7}, [r1]               // d7 = edge[7..14] (presumably the left neighbours, stored bottom-first; confirm edge[] layout)
+    vdup.8      d6, d7[0]                // d6 = edge[7] replicated; pads past the end after the reversal
+    vrev64.8    d7, d7                   // reverse byte order so that d7[0] = edge[14]
+
+    vext.8      d4, d7, d6, #2           // d4 = { d7[2..7], pad, pad }: neighbour two bytes ahead
+    vext.8      d2, d7, d6, #1           // d2 = { d7[1..7], pad }: neighbour one byte ahead
+
+    vhadd.u8    d16, d7, d4              // d16 = (d7[i] + d7[i+2]) >> 1
+    vrhadd.u8   d0, d2, d7               // d0 = (d7[i] + d7[i+1] + 1) >> 1: 2-tap average
+    vrhadd.u8   d1, d16, d2              // d1 = 3-tap filter centred on d7[i+1]
+
+    vzip.8      d0, d1                   // q0 = { 2tap[0], 3tap[0], 2tap[1], 3tap[1], ... } (16 interleaved bytes)
+
+    vdup.16     q1, d1[3]                // q1 = last interleaved byte pair replicated; fills rows that run past q0
+
+    vext.8      q2, q0, q1, #2           // q2 = q0 advanced by 2 bytes, filled from q1
+    vext.8      q3, q0, q1, #4           // q3 = q0 advanced by 4 bytes
+    vext.8      q8, q0, q1, #6           // q8 = q0 advanced by 6 bytes
+    vst1.8      {d0}, [r0,:64], r12      // rows 0..3 = low halves of q0, q2, q3, q8
+    vst1.8      {d4}, [r0,:64], r12
+    vst1.8      {d6}, [r0,:64], r12
+    vst1.8      {d16}, [r0,:64], r12
+
+    vst1.8      {d1}, [r0,:64], r12      // rows 4..7 = high halves of q0, q2, q3, q8
+    vst1.8      {d5}, [r0,:64], r12
+    vst1.8      {d7}, [r0,:64], r12
+    vst1.8      {d17}, [r0,:64]          // last row: no post-increment needed
+    bx          lr
+.endfunc
 
 function x264_predict_8x8c_dc_top_neon
     sub         r2, r0, #FDEC_STRIDE
@@ -223,7 +434,7 @@ function x264_predict_8x8c_dc_top_neon
     vdup.8      d0, d0[0]
     vtrn.32     d0, d1
     b           pred8x8_dc_end
-    .endfunc
+.endfunc
 
 function x264_predict_8x8c_dc_left_neon
     mov         r1, #FDEC_STRIDE
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index bccdc504..bf8fd38a 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -28,6 +28,7 @@
 #include "pixel.h"
 
 void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src ); // added to predict-a.S by this patch
 void x264_predict_4x4_h_armv6( uint8_t *src );
 void x264_predict_4x4_ddr_armv6( uint8_t *src );
 void x264_predict_4x4_ddl_neon( uint8_t *src );
@@ -40,7 +41,14 @@ void x264_predict_8x8c_v_neon( uint8_t *src );
 void x264_predict_8x8c_p_neon( uint8_t *src );
 
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); // the 8x8 predictors below are added to predict-a.S by this patch
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
 
 void x264_predict_16x16_dc_neon( uint8_t *src );
 void x264_predict_16x16_dc_top_neon( uint8_t *src );
@@ -62,6 +70,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
     if (!(cpu&X264_CPU_NEON))
         return;
 
+    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon; // only reached when cpu has X264_CPU_NEON (see the early return above)
     pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
 #endif // !HIGH_BIT_DEPTH
 }
@@ -87,8 +96,15 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
         return;
 
 #if !HIGH_BIT_DEPTH
+    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon; // new NEON entries are installed only in !HIGH_BIT_DEPTH builds
+    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon;
+    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon;
     pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
     pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon;
+    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon;
+    pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon;
 #endif // !HIGH_BIT_DEPTH
 }
 
-- 
2.40.0