From 2dcc6072d12deaf27705dc2beb63e192bd590232 Mon Sep 17 00:00:00 2001
From: David Conrad
Date: Mon, 24 Aug 2009 00:58:42 -0700
Subject: [PATCH] GSOC merge part 6: ARM NEON quant assembly functions (partial)

(de)quant 4x4, (de)quant 8x8, (de)quant DC, coeff_last
---
 Makefile             |   2 +-
 common/arm/quant-a.S | 352 +++++++++++++++++++++++++++++++++++++++++++
 common/arm/quant.h   |  42 ++++++
 common/quant.c       |  22 +++
 4 files changed, 417 insertions(+), 1 deletion(-)
 create mode 100644 common/arm/quant-a.S
 create mode 100644 common/arm/quant.h

diff --git a/Makefile b/Makefile
index 5d93df27..7027cdf2 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ endif
 ifeq ($(ARCH),ARM)
 ifneq ($(AS),)
 ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
-          common/arm/dct-a.S
+          common/arm/dct-a.S common/arm/quant-a.S
 SRCS += common/arm/mc-c.c
 OBJASM = $(ASMSRC:%.S=%.o)
 endif

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
new file mode 100644
index 00000000..81ec1b18
--- /dev/null
+++ b/common/arm/quant-a.S
@@ -0,0 +1,352 @@
+/*****************************************************************************
+ * quant.S: h264 encoder
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+
+.section .rodata
+.align 4
+pmovmskb_byte:
+.byte 1,2,4,8,16,32,64,128
+.byte 1,2,4,8,16,32,64,128
+
+.text
+
+.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 load_mf=no
+    vadd.u16    q8,  q8,  \bias0
+    vadd.u16    q9,  q9,  \bias1
+.ifc \load_mf, yes
+    vld1.64     {\mf0-\mf3}, [r1,:128]!
+.endif
+    vmull.u16   q10, d16, \mf0
+    vmull.u16   q11, d17, \mf1
+    vmull.u16   q12, d18, \mf2
+    vmull.u16   q13, d19, \mf3
+    vshr.s16    q14, q14, #15
+    vshr.s16    q15, q15, #15
+    vshrn.u32   d16, q10, #16
+    vshrn.u32   d17, q11, #16
+    vshrn.u32   d18, q12, #16
+    vshrn.u32   d19, q13, #16
+    veor        q8,  q8,  q14
+    veor        q9,  q9,  q15
+    vsub.s16    q8,  q8,  q14
+    vsub.s16    q9,  q9,  q15
+    vorr        \bias0, q8, q9
+    vst1.64     {d16-d19}, [r0,:128]!
+.endm
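+
+// For reference: QUANT_TWO above is the NEON counterpart of the scalar
+// QUANT_ONE macro in common/quant.c. An illustrative C sketch of the
+// per-coefficient math (not the exact C source):
+//
+//     coef = ((abs(coef) + bias) * mf) >> 16;
+//     coef = (coef ^ sign) - sign;            // sign = original coef >> 15
+//
+// q14/q15 hold the original signed coefficients and q8/q9 their biased
+// absolute values; the vorr into \bias0 accumulates a nonzero mask that
+// QUANT_END below reduces to the function's 0/1 return value.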
+
+.macro QUANT_END d
+    vmov        r2,  r3,  \d
+    orrs        r0,  r2,  r3
+    movne       r0,  #1
+    bx          lr
+.endm
+
+// quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
+function x264_quant_2x2_dc_neon, export=1
+    vld1.64     {d0}, [r0,:64]
+    vabs.s16    d3,  d0
+    vdup.16     d2,  r2
+    vdup.16     d1,  r1
+    vadd.u16    d3,  d3,  d2
+    vmull.u16   q3,  d3,  d1
+    vshr.s16    d0,  d0,  #15
+    vshrn.u32   d3,  q3,  #16
+    veor        d3,  d3,  d0
+    vsub.s16    d3,  d3,  d0
+    vst1.64     {d3}, [r0,:64]
+    QUANT_END   d3
+.endfunc
+
+// quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
+function x264_quant_4x4_dc_neon, export=1
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vdup.16     q0,  r2
+    vdup.16     q2,  r1
+    QUANT_TWO   q0,  q0,  d4,  d5,  d4,  d5
+    vorr        d0,  d0,  d1
+    QUANT_END   d0
+.endfunc
+
+// quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon, export=1
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},  [r2,:128]
+    vld1.64     {d4-d7},  [r1,:128]
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+    vorr        d0,  d0,  d1
+    QUANT_END   d0
+.endfunc
+
+// quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon, export=1
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d0-d3},  [r2,:128]!
+    vld1.64     {d4-d7},  [r1,:128]!
+    QUANT_TWO   q0,  q1,  d4,  d5,  d6,  d7
+.rept 3
+    vld1.64     {d28-d31}, [r0,:128]
+    vabs.s16    q8,  q14
+    vabs.s16    q9,  q15
+    vld1.64     {d2-d5},  [r2,:128]!
+    QUANT_TWO   q1,  q2,  d4,  d5,  d6,  d7, yes
+    vorr        q0,  q0,  q1
+.endr
+    vorr        d0,  d0,  d1
+    QUANT_END   d0
+.endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+    movw        r3,  #0x2b
+    mul         r3,  r3,  r2
+    lsr         r3,  r3,  #8            // i_qbits = i_qp / 6
+    add         ip,  r3,  r3,  lsl #1
+    sub         r2,  r2,  ip,  lsl #1   // i_mf = i_qp % 6
+.ifc \dc,no
+    add         r1,  r1,  r2,  lsl #\mf_size   // dequant_mf[i_mf]
+.else
+    ldr         r1, [r1,  r2,  lsl #\mf_size]  // dequant_mf[i_mf][0][0]
+.endif
+    subs        r3,  r3,  #\offset      // 6 for 8x8
+.endm
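+
+// The DEQUANT macro below expands into the 4x4 and 8x8 dequant functions.
+// For reference, an illustrative C sketch of the scalar fallback in
+// common/quant.c (not the exact source; i_mf = i_qp%6 and i_qbits = i_qp/6
+// minus the per-size offset, 4 for 4x4 or 6 for 8x8):
+//
+//     if( i_qbits >= 0 )
+//         dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits;
+//     else  // f = 1 << (-i_qbits-1) is the rounding term, kept in ip below
+//         dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits);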
+
+// dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon, export=1
+    DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+    mov         r2,  #4
+.endif
+    blt         dequant_\size\()_rshift
+
+    vdup.16     q15, r3
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+    subs        r2,  r2,  #1
+.endif
+    vld1.32     {d16-d17}, [r1,:128]!
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8
+    vld1.32     {d20-d21}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d22-d23}, [r1,:128]!
+    vmovn.s32   d6,  q10
+    vld1.16     {d0-d3},   [r0,:128]
+    vmovn.s32   d7,  q11
+    vmul.s16    q0,  q0,  q2
+    vmul.s16    q1,  q1,  q3
+    vshl.s16    q0,  q0,  q15
+    vshl.s16    q1,  q1,  q15
+    vst1.16     {d0-d3},   [r0,:128]!
+.ifc \size, 8x8
+    bgt         dequant_\size\()_lshift_loop
+.endif
+    bx          lr
+
+dequant_\size\()_rshift:
+    vdup.32     q15, r3
+    rsb         r3,  r3,  #0
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3
+
+.ifc \size, 8x8
+dequant_\size\()_rshift_loop:
+    subs        r2,  r2,  #1
+.endif
+    vdup.32     q10, ip
+    vld1.32     {d16-d17}, [r1,:128]!
+    vdup.32     q11, ip
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d4,  q8
+    vld1.32     {d16-d17}, [r1,:128]!
+    vmovn.s32   d5,  q9
+    vld1.32     {d18-d19}, [r1,:128]!
+    vmovn.s32   d6,  q8
+    vld1.16     {d0-d3},   [r0,:128]
+    vmovn.s32   d7,  q9
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4
+    vmlal.s16   q11, d1,  d5
+    vmlal.s16   q12, d2,  d6
+    vmlal.s16   q13, d3,  d7
+    vshl.s32    q10, q10, q15
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]!
+.ifc \size, 8x8
+    bgt         dequant_\size\()_rshift_loop
+.endif
+    bx          lr
+.endfunc
+.endm
+
+DEQUANT 4x4, 4
+DEQUANT 8x8, 6
+
+// dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+function x264_dequant_4x4_dc_neon, export=1
+    DEQUANT_START 6, 6, yes
+    blt         dequant_4x4_dc_rshift
+
+    lsl         r1,  r1,  r3
+    vdup.16     q2,  r1
+    vld1.16     {d0-d3},   [r0,:128]
+    vdup.16     q15, r3
+
+    vmul.s16    q0,  q0,  q2
+    vmul.s16    q1,  q1,  q2
+    vst1.16     {d0-d3},   [r0,:128]
+    bx          lr
+
+dequant_4x4_dc_rshift:
+    vdup.16     d4,  r1
+    vdup.32     q15, r3
+    rsb         r3,  r3,  #0
+    mov         ip,  #1
+    sub         r3,  r3,  #1
+    lsl         ip,  ip,  r3
+
+    vdup.32     q10, ip
+    vdup.32     q11, ip
+    vld1.16     {d0-d3},   [r0,:128]
+    vdup.32     q12, ip
+    vdup.32     q13, ip
+
+    vmlal.s16   q10, d0,  d4
+    vmlal.s16   q11, d1,  d4
+    vmlal.s16   q12, d2,  d4
+    vmlal.s16   q13, d3,  d4
+    vshl.s32    q10, q10, q15
+    vshl.s32    q11, q11, q15
+    vshl.s32    q12, q12, q15
+    vshl.s32    q13, q13, q15
+
+    vmovn.s32   d0,  q10
+    vmovn.s32   d1,  q11
+    vmovn.s32   d2,  q12
+    vmovn.s32   d3,  q13
+    vst1.16     {d0-d3},   [r0,:128]
+    bx          lr
+.endfunc
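+
+// Reference for the coeff_last family: return the index of the last nonzero
+// coefficient. An illustrative C sketch (the scalar code in common/quant.c
+// differs in details):
+//
+//     int i = size - 1;
+//     while( i > 0 && !l[i] )
+//         i--;
+//     return i;
+//
+// The NEON versions instead build a per-coefficient nonzero bitmask (vtst
+// plus narrowing shifts) and derive the index from a leading-zero count
+// (vclz), clamping the result to 0 for an empty block.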
+
+// int coeff_last( int16_t *l )
+function x264_coeff_last4_arm, export=1
+    ldrd        r2,  r3,  [r0]
+    subs        r0,  r3,  #0
+    movne       r0,  #2
+    movne       r2,  r3
+    lsrs        r2,  r2,  #16
+    addne       r0,  r0,  #1
+    bx          lr
+.endfunc
+
+.macro COEFF_LAST_1x size
+function x264_coeff_last\size\()_neon, export=1
+.if \size == 15
+    sub         r0,  r0,  #2
+    vld1.64     {d0-d3}, [r0]
+.else
+    vld1.64     {d0-d3}, [r0,:128]
+.endif
+    vtst.16     q0,  q0
+    vtst.16     q1,  q1
+    vshrn.u16   d0,  q0,  #8
+    vshrn.u16   d1,  q1,  #8
+    vshrn.u16   d0,  q0,  #4
+    vclz.i32    d0,  d0
+    mov         ip,  #7
+    mov         r3,  #\size - 9
+    vmov        r0,  r1,  d0
+
+    subs        r1,  ip,  r1,  lsr #2
+    addge       r0,  r1,  #\size - 8
+    sublts      r0,  r3,  r0,  lsr #2
+    movlt       r0,  #0
+    bx          lr
+.endfunc
+.endm
+
+COEFF_LAST_1x 15
+COEFF_LAST_1x 16
+
+function x264_coeff_last64_neon, export=1
+    vld1.64     {d16-d19}, [r0,:128]!
+    vqmovn.u16  d16, q8
+    vqmovn.u16  d17, q9
+    vld1.64     {d20-d23}, [r0,:128]!
+    vqmovn.u16  d18, q10
+    vqmovn.u16  d19, q11
+    vld1.64     {d24-d27}, [r0,:128]!
+    vqmovn.u16  d20, q12
+    vqmovn.u16  d21, q13
+    vld1.64     {d28-d31}, [r0,:128]!
+    vqmovn.u16  d22, q14
+    vqmovn.u16  d23, q15
+
+    movrel      r1,  pmovmskb_byte
+    vld1.64     {d0-d1}, [r1,:128]
+
+    vtst.8      q8,  q8
+    vtst.8      q9,  q9
+    vtst.8      q10, q10
+    vtst.8      q11, q11
+
+    vand        q8,  q8,  q0
+    vand        q9,  q9,  q0
+    vand        q10, q10, q0
+    vand        q11, q11, q0
+
+    vpadd.u8    d0,  d16, d17
+    vpadd.u8    d1,  d18, d19
+    vpadd.u8    d2,  d20, d21
+    vpadd.u8    d3,  d22, d23
+    vpadd.u8    d0,  d0,  d1
+    vpadd.u8    d1,  d2,  d3
+    vpadd.u8    d0,  d0,  d1
+    vclz.i32    d0,  d0
+    mov         ip,  #31
+    vmov        r0,  r1,  d0
+
+    subs        r1,  ip,  r1
+    addge       r0,  r1,  #32
+    sublts      r0,  ip,  r0
+    movlt       r0,  #0
+    bx          lr
+.endfunc
diff --git a/common/arm/quant.h b/common/arm/quant.h
new file mode 100644
index 00000000..0df9ebf1
--- /dev/null
+++ b/common/arm/quant.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+ * quant.h: h264 encoder library
+ *****************************************************************************
+ * Copyright (C) 2005-2008 x264 project
+ *
+ * Authors: David Conrad
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#ifndef X264_ARM_QUANT_H
+#define X264_ARM_QUANT_H
+
+int x264_quant_2x2_dc_armv6( int16_t dct[2][2], int mf, int bias );
+
+int x264_quant_2x2_dc_neon( int16_t dct[2][2], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[4][4], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
+
+void x264_dequant_4x4_dc_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
+
+int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
+
+#endif
diff --git a/common/quant.c b/common/quant.c
index daf2b5a2..263fb7c1 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -29,6 +29,9 @@
 #ifdef ARCH_PPC
 #   include "ppc/quant.h"
 #endif
+#ifdef ARCH_ARM
+#   include "arm/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -428,6 +431,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->dequant_8x8 = x264_dequant_8x8_altivec;
     }
 #endif
+
+#ifdef HAVE_ARMV6
+    if( cpu&X264_CPU_ARMV6 )
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->quant_2x2_dc   = x264_quant_2x2_dc_neon;
+        pf->quant_4x4      = x264_quant_4x4_neon;
+        pf->quant_4x4_dc   = x264_quant_4x4_dc_neon;
+        pf->quant_8x8      = x264_quant_8x8_neon;
+        pf->dequant_4x4    = x264_dequant_4x4_neon;
+        pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon;
+        pf->dequant_8x8    = x264_dequant_8x8_neon;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+    }
+#endif
     pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
     pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
     pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
-- 
2.40.0