From 50d7fb80d8cb773cd6d495e083867c3685726352 Mon Sep 17 00:00:00 2001
From: David Conrad
Date: Mon, 24 Aug 2009 01:38:42 -0700
Subject: [PATCH] GSOC merge part 8: ARM NEON intra prediction assembly functions (partial)

4x4 dc/h/ddr/ddl, 8x8 dc/h, 8x8c h/v, 16x16 dc/h/v
---
 Makefile               |   5 +-
 common/arm/predict-a.S | 272 +++++++++++++++++++++++++++++++++++++++++
 common/arm/predict-c.c |  83 +++++++++++++
 common/arm/predict.h   |  31 +++++
 common/predict.c       |  19 +++
 5 files changed, 408 insertions(+), 2 deletions(-)
 create mode 100644 common/arm/predict-a.S
 create mode 100644 common/arm/predict-c.c
 create mode 100644 common/arm/predict.h

diff --git a/Makefile b/Makefile
index 5a0938d2..2e5e2bd6 100644
--- a/Makefile
+++ b/Makefile
@@ -59,8 +59,9 @@ endif
 ifeq ($(ARCH),ARM)
 ifneq ($(AS),)
 ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
-          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S
-SRCS   += common/arm/mc-c.c
+          common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
+          common/arm/predict-a.S
+SRCS   += common/arm/mc-c.c common/arm/predict-c.c
 OBJASM  = $(ASMSRC:%.S=%.o)
 endif
 endif
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
new file mode 100644
index 00000000..46e687b3
--- /dev/null
+++ b/common/arm/predict-a.S
@@ -0,0 +1,272 @@
+/*****************************************************************************
+ * predict_armv6.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+
+pw_76543210: .short 7,6,5,4,3,2,1,0  // not referenced by any function in this patch
+
+.text
+
+// because gcc doesn't believe in using the free shift in add
+function x264_predict_4x4_h_armv6, export=1  // r0 = src: fill each of 4 rows with the byte to its left
+    ldrb    r1, [r0, #0*FDEC_STRIDE-1]       // r1, r2, r3, ip = byte just left of rows 0..3
+    ldrb    r2, [r0, #1*FDEC_STRIDE-1]
+    ldrb    r3, [r0, #2*FDEC_STRIDE-1]
+    ldrb    ip, [r0, #3*FDEC_STRIDE-1]
+    add     r1, r1, r1, lsl #8               // copy the byte into bits 8-15
+    add     r2, r2, r2, lsl #8
+    add     r3, r3, r3, lsl #8
+    add     ip, ip, ip, lsl #8
+    add     r1, r1, r1, lsl #16              // copy the halfword up: all 4 bytes now equal
+    str     r1, [r0, #0*FDEC_STRIDE]         // write the 4 bytes of row 0
+    add     r2, r2, r2, lsl #16
+    str     r2, [r0, #1*FDEC_STRIDE]         // row 1
+    add     r3, r3, r3, lsl #16
+    str     r3, [r0, #2*FDEC_STRIDE]         // row 2
+    add     ip, ip, ip, lsl #16
+    str     ip, [r0, #3*FDEC_STRIDE]         // row 3
+    bx      lr
+.endfunc
+
+function x264_predict_4x4_dc_armv6, export=1 // r0 = src: fill 4x4 with (sum of 4 top + 4 left bytes + 4) >> 3
+    mov     ip, #0                           // zero operand so usad8 yields a plain byte sum
+    ldr     r1, [r0, #-FDEC_STRIDE]          // r1 = the 4 bytes of the row above
+    ldrb    r2, [r0, #0*FDEC_STRIDE-1]       // left byte of row 0
+    ldrb    r3, [r0, #1*FDEC_STRIDE-1]       // left byte of row 1
+    usad8   r1, r1, ip                       // r1 = sum of the 4 top bytes (|byte - 0| summed)
+    add     r2, r2, #4                       // rounding term for the >> 3 below
+    ldrb    ip, [r0, #2*FDEC_STRIDE-1]       // ip reused: left byte of row 2
+    add     r2, r2, r3
+    ldrb    r3, [r0, #3*FDEC_STRIDE-1]       // left byte of row 3
+    add     r2, r2, ip
+    add     r2, r2, r3                       // r2 = left sum + 4
+    add     r1, r1, r2                       // r1 = top sum + left sum + 4
+    lsr     r1, r1, #3                       // r1 = dc value
+    add     r1, r1, r1, lsl #8               // replicate dc into the low halfword
+    add     r1, r1, r1, lsl #16              // replicate dc into all 4 bytes
+    str     r1, [r0, #0*FDEC_STRIDE]         // same word written to all 4 rows
+    str     r1, [r0, #1*FDEC_STRIDE]
+    str     r1, [r0, #2*FDEC_STRIDE]
+    str     r1, [r0, #3*FDEC_STRIDE]
+    bx      lr
+.endfunc
+
+// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
+.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
+    uhadd8  \a1, \a1, \c1                    // per byte: a = (a + c) >> 1
+    uhadd8  \a2, \a2, \c2
+    uhadd8  \c1, \a1, \b1                    // c (clobbered) = (a + b) >> 1, truncated
+    uhadd8  \c2, \a2, \b2
+    eor     \a1, \a1, \b1                    // bit 0 of a^b is the bit the truncating halving add dropped
+    eor     \a2, \a2, \b2
+    and     \a1, \a1, \pb_1                  // keep only that bit in each byte (pb_1 is 0x01010101 at the call site)
+    and     \a2, \a2, \pb_1
+    uadd8   \a1, \a1, \c1                    // add it back: a = (a + b + 1) >> 1 per byte
+    uadd8   \a2, \a2, \c2
+.endm
+
+function x264_predict_4x4_ddr_armv6, export=1 // r0 = src: 4x4 prediction from top, top-left and left neighbours
+    ldr     r1, [r0, # -FDEC_STRIDE]         // r1 = the 4 bytes of the row above
+    ldrb    r2, [r0, # -FDEC_STRIDE-1]       // r2 = top-left byte
+    ldrb    r3, [r0, #0*FDEC_STRIDE-1]       // r3 = left byte of row 0
+    push    {r4-r6,lr}                       // r4-r6 are used as scratch below
+    add     r2, r2, r1, lsl #8               // r2 = (top << 8) | top-left
+    ldrb    r4, [r0, #1*FDEC_STRIDE-1]       // left byte of row 1
+    add     r3, r3, r2, lsl #8               // r3 = (r2 << 8) | left row 0
+    ldrb    r5, [r0, #2*FDEC_STRIDE-1]       // left byte of row 2
+    ldrb    r6, [r0, #3*FDEC_STRIDE-1]       // left byte of row 3
+    add     r4, r4, r3, lsl #8               // r4 = (r3 << 8) | left row 1
+    add     r5, r5, r4, lsl #8               // r5 = (r4 << 8) | left row 2
+    add     r6, r6, r5, lsl #8               // r6 = (r5 << 8) | left row 3
+    ldr     ip, pb_1                         // ip = 0x01010101, defined right after this function
+    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip // r1 = filter(r1,r2,r3), r4 = filter(r4,r5,r6); r3 and r6 are clobbered
+    str     r1, [r0, #0*FDEC_STRIDE]         // row 0 = first filtered word
+    lsl     r2, r1, #8                       // row 0 moved up by 1 byte
+    lsl     r3, r1, #16                      // row 0 moved up by 2 bytes
+    lsl     r4, r4, #8                       // drop the top byte of the second filtered word
+    lsl     r5, r1, #24                      // row 0 moved up by 3 bytes
+    add     r2, r2, r4, lsr #24              // fill the vacated low byte from the second filtered word
+    str     r2, [r0, #1*FDEC_STRIDE]         // row 1
+    add     r3, r3, r4, lsr #16              // fill the 2 vacated low bytes
+    str     r3, [r0, #2*FDEC_STRIDE]         // row 2
+    add     r5, r5, r4, lsr #8               // fill the 3 vacated low bytes
+    str     r5, [r0, #3*FDEC_STRIDE]         // row 3
+    pop     {r4-r6,pc}                       // restore scratch registers and return
+.endfunc
+
+pb_1: .word 0x01010101                       // literal kept in .text so the pc-relative ldr above can reach it
+
+function x264_predict_4x4_ddl_neon, export=1 // r0 = src: 4x4 prediction from the 8 bytes of the row above
+    sub         r0, #FDEC_STRIDE             // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE             // ip = row stride used for post-increment
+    vld1.64     {d0}, [r0], ip               // d0 = 8 bytes above; r0 advances to row 0
+    vdup.8      d3, d0[7]                    // d3 = last top byte in every lane (edge padding)
+    vext.8      d1, d0, d0, #1               // d1 = d0 rotated down by one byte
+    vext.8      d2, d0, d3, #2               // d2 = d0 moved down two bytes, padded with the last byte
+    vhadd.u8    d0, d0, d2                   // (t[i] + t[i+2]) >> 1
+    vrhadd.u8   d0, d0, d1                   // rounded average with t[i+1]; lane 7 mixes in t[0] but is never stored
+    vst1.32     {d0[0]}, [r0,:32], ip        // row 0 = lanes 0-3
+    vext.8      d1, d0, d0, #1               // bring lanes 1-4 into the low word
+    vext.8      d2, d0, d0, #2               // bring lanes 2-5 into the low word
+    vst1.32     {d1[0]}, [r0,:32], ip        // row 1
+    vext.8      d3, d0, d0, #3               // bring lanes 3-6 into the low word
+    vst1.32     {d2[0]}, [r0,:32], ip        // row 2
+    vst1.32     {d3[0]}, [r0,:32], ip        // row 3
+    bx          lr
+.endfunc
+
+function x264_predict_8x8_dc_neon, export=1  // r0 = src, r1 = edge: fill 8x8 with (16 edge bytes summed + 8) >> 4
+    mov     ip, #0                           // zero operand for the usad8/usada8 byte sums
+    ldrd    r2, [r1, #8]                     // r2, r3 = edge[8..15]
+    push    {r4-r5,lr}
+    ldrd    r4, [r1, #16]                    // r4, r5 = edge[16..23]
+    lsl     r3, r3, #8                       // shift out the top byte of r3 (edge[15], assuming little-endian loads)
+    ldrb    lr, [r1, #7]                     // lr = edge[7]
+    usad8   r2, r2, ip                       // r2 = sum of its 4 bytes
+    usad8   r3, r3, ip                       // r3 = sum of its 3 remaining bytes
+    usada8  r2, r4, ip, r2                   // r2 += sum of the 4 bytes of r4
+    add     lr, lr, #8                       // rounding term for the >> 4 below
+    usada8  r3, r5, ip, r3                   // r3 += sum of the 4 bytes of r5
+    add     r2, r2, lr
+    mov     ip, #FDEC_STRIDE                 // ip reused as the row stride
+    add     r2, r2, r3                       // r2 = total sum + 8
+    lsr     r2, r2, #4                       // r2 = dc value
+
+    vdup.8   d0, r2                          // broadcast the low byte of r2 to 8 lanes
+.rept 8
+    vst1.64 {d0}, [r0,:64], ip               // one 8-byte row per repetition
+.endr
+    pop    {r4-r5,pc}
+.endfunc
+
+
+function x264_predict_8x8_h_neon, export=1   // r0 = src, r1 = edge: row n is filled with edge[14-n]
+    add         r1, r1, #7                   // r1 = &edge[7]
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d16}, [r1]                  // d16 = edge[7..14]
+    vdup.8      d0, d16[7]                   // row 0 value = edge[14]
+    vdup.8      d1, d16[6]                   // row 1 value = edge[13]
+    vst1.64     {d0}, [r0,:64], ip           // stores are interleaved with the next broadcasts
+    vdup.8      d2, d16[5]
+    vst1.64     {d1}, [r0,:64], ip
+    vdup.8      d3, d16[4]
+    vst1.64     {d2}, [r0,:64], ip
+    vdup.8      d4, d16[3]
+    vst1.64     {d3}, [r0,:64], ip
+    vdup.8      d5, d16[2]
+    vst1.64     {d4}, [r0,:64], ip
+    vdup.8      d6, d16[1]
+    vst1.64     {d5}, [r0,:64], ip
+    vdup.8      d7, d16[0]                   // row 7 value = edge[7]
+    vst1.64     {d6}, [r0,:64], ip
+    vst1.64     {d7}, [r0,:64], ip
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_h_neon, export=1  // r0 = src: fill each of 8 rows with the byte to its left
+    sub         r1, r0, #1                   // r1 -> byte left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 4
+    vld1.8      {d0[]}, [r1], ip             // load one left byte into all 8 lanes, step to next row
+    vld1.8      {d2[]}, [r1], ip
+    vst1.64     {d0}, [r0,:64], ip           // two rows written per repetition
+    vst1.64     {d2}, [r0,:64], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_8x8c_v_neon, export=1  // r0 = src: copy the 8 bytes above into all 8 rows
+    sub         r0, r0, #FDEC_STRIDE         // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0,:64], ip           // d0 = row above; r0 advances to row 0
+.rept 8
+    vst1.64     {d0}, [r0,:64], ip
+.endr
+    bx          lr
+.endfunc
+
+
+function x264_predict_16x16_dc_neon, export=1 // r0 = src: fill 16x16 with (16 top + 16 left bytes + 16) >> 5
+    sub         r3, r0, #FDEC_STRIDE         // r3 -> row above the block
+    sub         r0, r0, #1                   // r0 -> byte left of row 0
+    vld1.64     {d0-d1}, [r3,:128]           // q0 = 16 bytes above
+    ldrb        ip, [r0], #FDEC_STRIDE       // ip = left byte of row 0; r0 steps one row per load
+    vaddl.u8    q0, d0, d1                   // widen to 16 bit: 8 sums of top[i] + top[i+8]
+    ldrb        r1, [r0], #FDEC_STRIDE       // scalar left-column loads are interleaved with the NEON top sum
+    vadd.u16    d0, d0, d1                   // 4 partial sums
+    vpadd.u16   d0, d0, d0                   // 2 partial sums, duplicated
+    vpadd.u16   d0, d0, d0                   // every lane of d0 = sum of the 16 top bytes
+.rept 4
+    ldrb        r2, [r0], #FDEC_STRIDE       // 3 more left bytes per repetition; each add trails its load
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE
+    add         ip, ip, r2
+    ldrb        r1, [r0], #FDEC_STRIDE
+    add         ip, ip, r3
+.endr
+    ldrb        r2, [r0], #FDEC_STRIDE       // left byte of row 14
+    add         ip, ip, r1
+    ldrb        r3, [r0], #FDEC_STRIDE       // left byte of row 15 (16th load)
+    add         ip, ip, r2
+
+    sub         r0, r0, #FDEC_STRIDE*16      // undo the 16 post-increments
+    add         ip, ip, r3                   // ip = sum of the 16 left bytes
+    vdup.16     d1, ip                       // left sum in every 16-bit lane
+    vadd.u16    d0, d0, d1                   // top sum + left sum
+    mov         ip, #FDEC_STRIDE             // ip reused as the row stride
+    add         r0, r0, #1                   // r0 = src again
+    vrshr.u16   d0, d0, #5                   // rounding shift: (sum + 16) >> 5
+    vdup.8      q0, d0[0]                    // broadcast the dc byte to 16 lanes
+.rept 16
+    vst1.64     {d0-d1}, [r0,:64], ip        // one 16-byte row per repetition
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_h_neon, export=1 // r0 = src: fill each of 16 rows with the byte to its left
+    sub         r1, r0, #1                   // r1 -> byte left of row 0
+    mov         ip, #FDEC_STRIDE
+.rept 8
+    vld1.8      {d0[]}, [r1], ip             // left byte replicated across d0
+    vmov        d1, d0                       // extend to 16 bytes (q0)
+    vld1.8      {d2[]}, [r1], ip
+    vmov        d3, d2                       // extend to 16 bytes (q1)
+    vst1.64     {d0-d1}, [r0,:128], ip       // two rows written per repetition
+    vst1.64     {d2-d3}, [r0,:128], ip
+.endr
+    bx          lr
+.endfunc
+
+function x264_predict_16x16_v_neon, export=1 // r0 = src: copy the 16 bytes above into all 16 rows
+    sub         r0, r0, #FDEC_STRIDE         // r0 -> row above the block
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0-d1}, [r0,:128], ip       // q0 = row above; r0 advances to row 0
+.rept 16
+    vst1.64     {d0-d1}, [r0,:128], ip
+.endr
+    bx          lr
+.endfunc
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
new file mode 100644
index 00000000..1f2cd52a
--- /dev/null
+++ b/common/arm/predict-c.c
@@ -0,0 +1,83 @@
+/*****************************************************************************
+ * predict.c: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_armv6( uint8_t *src );  // the functions declared here are implemented in predict-a.S
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[33] );  // the asm sums edge[7..14] and edge[16..23]
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[33] );   // the asm reads edge[7..14]
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )  // overwrite 4x4 entries of pf according to cpu flags
+{
+    if (!(cpu&X264_CPU_ARMV6))  // no ARMv6 flag: leave pf untouched
+        return;
+
+    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
+    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
+
+    if (!(cpu&X264_CPU_NEON))  // the entry below is only installed when the NEON flag is set
+        return;
+
+    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+}
+
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )  // overwrite the H and V entries of pf when NEON is flagged
+{
+    if (!(cpu&X264_CPU_NEON))  // no NEON flag: leave pf untouched
+        return;
+
+    pf[I_PRED_CHROMA_H]   = x264_predict_8x8c_h_neon;
+    pf[I_PRED_CHROMA_V]   = x264_predict_8x8c_v_neon;
+}
+
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )  // predict_filter is not used here
+{
+    if (!(cpu&X264_CPU_NEON))  // no NEON flag: leave pf untouched
+        return;
+
+    pf[I_PRED_8x8_DC]   = x264_predict_8x8_dc_neon;
+    pf[I_PRED_8x8_H]    = x264_predict_8x8_h_neon;
+}
+
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )  // overwrite the DC, H and V entries of pf when NEON is flagged
+{
+    if (!(cpu&X264_CPU_NEON))  // no NEON flag: leave pf untouched
+        return;
+
+    pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
+    pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
+    pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
+}
diff --git a/common/arm/predict.h b/common/arm/predict.h
new file mode 100644
index 00000000..fe5ccda8
--- /dev/null
+++ b/common/arm/predict.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * predict.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_PREDICT_H
+#define X264_ARM_PREDICT_H
+
+void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );  // the four init functions are defined in arm/predict-c.c
+void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
+
+#endif
diff --git a/common/predict.c b/common/predict.c
index fd07ae2e..385eb5ce 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -33,6 +33,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/predict.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -770,6 +773,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
         x264_predict_16x16_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_16x16_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -792,6 +799,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
         x264_predict_8x8c_init_altivec( pf );
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8c_init_arm( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -813,6 +824,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
 #ifdef HAVE_MMX
     x264_predict_8x8_init_mmx( cpu, pf, predict_filter );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_8x8_init_arm( cpu, pf, predict_filter );
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -833,5 +848,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
 #ifdef HAVE_MMX
     x264_predict_4x4_init_mmx( cpu, pf );
 #endif
+
+#ifdef HAVE_ARMV6
+    x264_predict_4x4_init_arm( cpu, pf );
+#endif
 }
 
-- 
2.40.0