From: Johann
Date: Mon, 20 Oct 2014 23:03:28 +0000 (-0700)
Subject: vp8 quantization -> intrinsics
X-Git-Tag: v1.4.0~533^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=7ae75c3d525d79b9b28652fb34082cf81a5de9ab;p=libvpx

vp8 quantization -> intrinsics

Use intrinsics for neon quantization. Slight loss (<5%) of performance
compared to the assembly. Roughly 10x faster on arm64 because that was
running C code before.

Change-Id: I7cf5242d8f29b7eab5bca6a1c20c89c9fc9ca66d
---
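As a reference for reading the intrinsics below: the NEON path is a
vectorized version of the scalar fast quantizer, computing the eob from the
inverse zigzag table in raster order rather than iterating in zigzag order.
A minimal scalar sketch of that same computation follows (illustration only;
the helper name is hypothetical, field names are the usual vp8 BLOCK/BLOCKD
members, and the in-tree scalar implementation is vp8_fast_quantize_b_c):

    #include <stdlib.h>
    #include "vp8/encoder/block.h"

    /* Scalar sketch of what the NEON code computes; not encoder code. */
    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d) {
      /* eob value for each raster position, i.e. zigzag index + 1 */
      static const short inv_zig_zag[16] = { 1, 2, 6,  7,  3,  5,  8,  13,
                                             4, 9, 12, 14, 10, 11, 15, 16 };
      int i, eob = 0;
      for (i = 0; i < 16; i++) {
        const int z = b->coeff[i];
        const int sz = z >> 31;  /* all 0 if positive, all 1 if negative */
        const int y = ((abs(z) + b->round[i]) * b->quant_fast[i]) >> 16;
        const int x = (y ^ sz) - sz;  /* restore sign */
        d->qcoeff[i] = (short)x;
        d->dqcoeff[i] = (short)(x * d->dequant[i]);
        if (x && inv_zig_zag[i] > eob) eob = inv_zig_zag[i];
      }
      *d->eob = (char)eob;
    }
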
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index c73ecf93f..88a5b5b09 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -454,16 +454,14 @@ add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/;
 $vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
-$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
 
 add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
 # no asm yet
 
 add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon_asm/;
-$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
+specialize qw/vp8_fast_quantize_b_pair neon/;
 
 add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
 specialize qw/vp8_quantize_mb neon/;
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
deleted file mode 100644
index 9374310e5..000000000
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_fast_quantize_b_neon|
-    EXPORT  |vp8_fast_quantize_b_pair_neon|
-
-    INCLUDE vp8_asm_enc_offsets.asm
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
-    stmfd   sp!, {r4-r9}
-    vstmdb  sp!, {q4-q7}
-
-    ldr     r4, [r0, #vp8_block_coeff]
-    ldr     r5, [r0, #vp8_block_quant_fast]
-    ldr     r6, [r0, #vp8_block_round]
-
-    vld1.16 {q0, q1}, [r4@128]  ; load z
-
-    ldr     r7, [r2, #vp8_blockd_qcoeff]
-
-    vabs.s16 q4, q0             ; calculate x = abs(z)
-    vabs.s16 q5, q1
-
-    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
-    vshr.s16 q2, q0, #15        ; sz
-    vshr.s16 q3, q1, #15
-
-    vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
-    vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
-
-    ldr     r4, [r1, #vp8_block_coeff]
-
-    vadd.s16 q4, q6             ; x + Round
-    vadd.s16 q5, q7
-
-    vld1.16 {q0, q1}, [r4@128]  ; load z2
-
-    vqdmulh.s16 q4, q8          ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16 q5, q9
-
-    vabs.s16 q10, q0            ; calculate x2 = abs(z_2)
-    vabs.s16 q11, q1
-    vshr.s16 q12, q0, #15       ; sz2
-    vshr.s16 q13, q1, #15
-
-    ;modify data to have its original sign
-    veor.s16 q4, q2             ; y^sz
-    veor.s16 q5, q3
-
-    vadd.s16 q10, q6            ; x2 + Round
-    vadd.s16 q11, q7
-
-    ldr     r8, [r2, #vp8_blockd_dequant]
-
-    vqdmulh.s16 q10, q8         ; y2 = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16 q11, q9
-
-    vshr.s16 q4, #1             ; right shift 1 after vqdmulh
-    vshr.s16 q5, #1
-
-    vld1.s16 {q6, q7}, [r8@128] ; load dequant_ptr[i]
-
-    vsub.s16 q4, q2             ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16 q5, q3
-
-    vshr.s16 q10, #1            ; right shift 1 after vqdmulh
-    vshr.s16 q11, #1
-
-    ldr     r9, [r2, #vp8_blockd_dqcoeff]
-
-    veor.s16 q10, q12           ; y2^sz2
-    veor.s16 q11, q13
-
-    vst1.s16 {q4, q5}, [r7]     ; store: qcoeff = x1
-
-    vsub.s16 q10, q12           ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16 q11, q13
-
-    ldr     r6, [r3, #vp8_blockd_qcoeff]
-
-    vmul.s16 q2, q6, q4         ; x * Dequant
-    vmul.s16 q3, q7, q5
-
-    adr     r0, inv_zig_zag     ; load ptr of inverse zigzag table
-
-    vceq.s16 q8, q8             ; set q8 to all 1
-
-    vst1.s16 {q10, q11}, [r6]   ; store: qcoeff = x2
-
-    vmul.s16 q12, q6, q10       ; x2 * Dequant
-    vmul.s16 q13, q7, q11
-
-    vld1.16 {q6, q7}, [r0@128]  ; load inverse scan order
-
-    vtst.16 q14, q4, q8         ; now find eob
-    vtst.16 q15, q5, q8         ; non-zero element is set to all 1
-
-    vst1.s16 {q2, q3}, [r9]     ; store dqcoeff = x * Dequant
-
-    ldr     r7, [r3, #vp8_blockd_dqcoeff]
-
-    vand    q0, q6, q14         ; get all valid numbers from scan array
-    vand    q1, q7, q15
-
-    vst1.s16 {q12, q13}, [r7]   ; store dqcoeff = x * Dequant
-
-    vtst.16 q2, q10, q8         ; now find eob
-    vtst.16 q3, q11, q8         ; non-zero element is set to all 1
-
-    vmax.u16 q0, q0, q1         ; find maximum value in q0, q1
-
-    vand    q10, q6, q2         ; get all valid numbers from scan array
-    vand    q11, q7, q3
-    vmax.u16 q10, q10, q11      ; find maximum value in q10, q11
-
-    vmax.u16 d0, d0, d1
-    vmax.u16 d20, d20, d21
-    vmovl.u16 q0, d0
-    vmovl.u16 q10, d20
-
-    vmax.u32 d0, d0, d1
-    vmax.u32 d20, d20, d21
-    vpmax.u32 d0, d0, d0
-    vpmax.u32 d20, d20, d20
-
-    ldr     r4, [r2, #vp8_blockd_eob]
-    ldr     r5, [r3, #vp8_blockd_eob]
-
-    vst1.8  {d0[0]}, [r4]       ; store eob
-    vst1.8  {d20[0]}, [r5]      ; store eob
-
-    vldmia  sp!, {q4-q7}
-    ldmfd   sp!, {r4-r9}
-    bx      lr
-
-    ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
-    stmfd   sp!, {r4-r7}
-
-    ldr     r3, [r0, #vp8_block_coeff]
-    ldr     r4, [r0, #vp8_block_quant_fast]
-    ldr     r5, [r0, #vp8_block_round]
-
-    vld1.16 {q0, q1}, [r3@128] ; load z
-    vorr.s16 q14, q0, q1       ; check if all zero (step 1)
-    ldr     r6, [r1, #vp8_blockd_qcoeff]
-    ldr     r7, [r1, #vp8_blockd_dqcoeff]
-    vorr.s16 d28, d28, d29     ; check if all zero (step 2)
-
-    vabs.s16 q12, q0           ; calculate x = abs(z)
-    vabs.s16 q13, q1
-
-    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
-    vshr.s16 q2, q0, #15       ; sz
-    vmov    r2, r3, d28        ; check if all zero (step 3)
-    vshr.s16 q3, q1, #15
-
-    vld1.s16 {q14, q15}, [r5@128] ; load round_ptr [0-15]
-    vld1.s16 {q8, q9}, [r4@128]   ; load quant_ptr [0-15]
-
-    vadd.s16 q12, q14          ; x + Round
-    vadd.s16 q13, q15
-
-    adr     r0, inv_zig_zag    ; load ptr of inverse zigzag table
-
-    vqdmulh.s16 q12, q8        ; y = ((Round+abs(z)) * Quant) >> 16
-    vqdmulh.s16 q13, q9
-
-    vld1.16 {q10, q11}, [r0@128] ; load inverse scan order
-
-    vceq.s16 q8, q8            ; set q8 to all 1
-
-    ldr     r4, [r1, #vp8_blockd_dequant]
-
-    vshr.s16 q12, #1           ; right shift 1 after vqdmulh
-    vshr.s16 q13, #1
-
-    ldr     r5, [r1, #vp8_blockd_eob]
-
-    orr     r2, r2, r3         ; check if all zero (step 4)
-    cmp     r2, #0             ; check if all zero (step 5)
-    beq     zero_output        ; check if all zero (step 6)
-
-    ;modify data to have its original sign
-    veor.s16 q12, q2           ; y^sz
-    veor.s16 q13, q3
-
-    vsub.s16 q12, q2           ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
-    vsub.s16 q13, q3
-
-    vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
-
-    vtst.16 q14, q12, q8       ; now find eob
-    vtst.16 q15, q13, q8       ; non-zero element is set to all 1
-
-    vst1.s16 {q12, q13}, [r6@128] ; store: qcoeff = x1
-
-    vand    q10, q10, q14      ; get all valid numbers from scan array
-    vand    q11, q11, q15
-
-    vmax.u16 q0, q10, q11      ; find maximum value in q0, q1
-    vmax.u16 d0, d0, d1
-    vmovl.u16 q0, d0
-
-    vmul.s16 q2, q12           ; x * Dequant
-    vmul.s16 q3, q13
-
-    vmax.u32 d0, d0, d1
-    vpmax.u32 d0, d0, d0
-
-    vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
-
-    vst1.8  {d0[0]}, [r5]      ; store eob
-
-    ldmfd   sp!, {r4-r7}
-    bx      lr
-
-zero_output
-    strb    r2, [r5]           ; store eob
-    vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
-    vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
-
-    ldmfd   sp!, {r4-r7}
-    bx      lr
-
-    ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
-    ALIGN 16    ; enable use of @128 bit aligned loads
-inv_zig_zag
-    DCW 0x0001, 0x0002, 0x0006, 0x0007
-    DCW 0x0003, 0x0005, 0x0008, 0x000d
-    DCW 0x0004, 0x0009, 0x000c, 0x000e
-    DCW 0x000a, 0x000b, 0x000f, 0x0010
-
-    END
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 000000000..48764284f
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+#include "vpx_mem/vpx_mem.h"
+
+static const uint16_t inv_zig_zag[16] = {
+  0x0001, 0x0002, 0x0006, 0x0007,
+  0x0003, 0x0005, 0x0008, 0x000d,
+  0x0004, 0x0009, 0x000c, 0x000e,
+  0x000a, 0x000b, 0x000f, 0x0010
+};
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+  const int16x8_t one_q = vdupq_n_s16(-1),
+                  z0 = vld1q_s16(b->coeff),
+                  z1 = vld1q_s16(b->coeff + 8),
+                  round0 = vld1q_s16(b->round),
+                  round1 = vld1q_s16(b->round + 8),
+                  quant0 = vld1q_s16(b->quant_fast),
+                  quant1 = vld1q_s16(b->quant_fast + 8),
+                  dequant0 = vld1q_s16(d->dequant),
+                  dequant1 = vld1q_s16(d->dequant + 8);
+  const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+                   zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+  int16x8_t x0, x1, sz0, sz1, y0, y1;
+  uint16x8_t eob0, eob1;
+  uint16x4_t eob_d16;
+  uint32x2_t eob_d32;
+  uint32x4_t eob_q32;
+
+  /* sign of z: z >> 15 */
+  sz0 = vshrq_n_s16(z0, 15);
+  sz1 = vshrq_n_s16(z1, 15);
+
+  /* x = abs(z) */
+  x0 = vabsq_s16(z0);
+  x1 = vabsq_s16(z1);
+
+  /* x += round */
+  x0 = vaddq_s16(x0, round0);
+  x1 = vaddq_s16(x1, round1);
+
+  /* y = 2 * (x * quant) >> 16 */
+  y0 = vqdmulhq_s16(x0, quant0);
+  y1 = vqdmulhq_s16(x1, quant1);
+
+  /* Compensate for doubling in vqdmulhq */
+  y0 = vshrq_n_s16(y0, 1);
+  y1 = vshrq_n_s16(y1, 1);
+
+  /* Restore sign bit */
+  y0 = veorq_s16(y0, sz0);
+  y1 = veorq_s16(y1, sz1);
+  x0 = vsubq_s16(y0, sz0);
+  x1 = vsubq_s16(y1, sz1);
+
+  /* find non-zero elements */
+  eob0 = vtstq_s16(x0, one_q);
+  eob1 = vtstq_s16(x1, one_q);
+
+  /* mask zig zag */
+  eob0 = vandq_u16(eob0, zig_zag0);
+  eob1 = vandq_u16(eob1, zig_zag1);
+
+  /* select the largest value */
+  eob0 = vmaxq_u16(eob0, eob1);
+  eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+  eob_q32 = vmovl_u16(eob_d16);
+  eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+  eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+  /* qcoeff = x */
+  vst1q_s16(d->qcoeff, x0);
+  vst1q_s16(d->qcoeff + 8, x1);
+
+  /* dqcoeff = x * dequant */
+  vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+  vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
+
+void vp8_fast_quantize_b_pair_neon(BLOCK *b0, BLOCK *b1,
+                                   BLOCKD *d0, BLOCKD *d1) {
+  const int16x8_t one_q = vdupq_n_s16(-1),
+                  b0_z0 = vld1q_s16(b0->coeff),
+                  b0_z1 = vld1q_s16(b0->coeff + 8),
+                  b0_round0 = vld1q_s16(b0->round),
+                  b0_round1 = vld1q_s16(b0->round + 8),
+                  b0_quant0 = vld1q_s16(b0->quant_fast),
+                  b0_quant1 = vld1q_s16(b0->quant_fast + 8),
+                  d0_dequant0 = vld1q_s16(d0->dequant),
+                  d0_dequant1 = vld1q_s16(d0->dequant + 8),
+                  b1_z0 = vld1q_s16(b1->coeff),
+                  b1_z1 = vld1q_s16(b1->coeff + 8),
+                  b1_round0 = vld1q_s16(b1->round),
+                  b1_round1 = vld1q_s16(b1->round + 8),
+                  b1_quant0 = vld1q_s16(b1->quant_fast),
+                  b1_quant1 = vld1q_s16(b1->quant_fast + 8),
+                  d1_dequant0 = vld1q_s16(d1->dequant),
+                  d1_dequant1 = vld1q_s16(d1->dequant + 8);
+  const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+                   zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+  int16x8_t b0_x0, b0_x1, b0_sz0, b0_sz1, b0_y0, b0_y1,
+            b1_x0, b1_x1, b1_sz0, b1_sz1, b1_y0, b1_y1;
+  uint16x8_t b0_eob0, b0_eob1,
+             b1_eob0, b1_eob1;
+  uint16x4_t b0_eob_d16, b1_eob_d16;
+  uint32x2_t b0_eob_d32, b1_eob_d32;
+  uint32x4_t b0_eob_q32, b1_eob_q32;
+
+  /* sign of z: z >> 15 */
+  b0_sz0 = vshrq_n_s16(b0_z0, 15);
+  b0_sz1 = vshrq_n_s16(b0_z1, 15);
+  b1_sz0 = vshrq_n_s16(b1_z0, 15);
+  b1_sz1 = vshrq_n_s16(b1_z1, 15);
+
+  /* x = abs(z) */
+  b0_x0 = vabsq_s16(b0_z0);
+  b0_x1 = vabsq_s16(b0_z1);
+  b1_x0 = vabsq_s16(b1_z0);
+  b1_x1 = vabsq_s16(b1_z1);
+
+  /* x += round */
+  b0_x0 = vaddq_s16(b0_x0, b0_round0);
+  b0_x1 = vaddq_s16(b0_x1, b0_round1);
+  b1_x0 = vaddq_s16(b1_x0, b1_round0);
+  b1_x1 = vaddq_s16(b1_x1, b1_round1);
+
+  /* y = 2 * (x * quant) >> 16 */
+  b0_y0 = vqdmulhq_s16(b0_x0, b0_quant0);
+  b0_y1 = vqdmulhq_s16(b0_x1, b0_quant1);
+  b1_y0 = vqdmulhq_s16(b1_x0, b1_quant0);
+  b1_y1 = vqdmulhq_s16(b1_x1, b1_quant1);
+
+  /* Compensate for doubling in vqdmulhq */
+  b0_y0 = vshrq_n_s16(b0_y0, 1);
+  b0_y1 = vshrq_n_s16(b0_y1, 1);
+  b1_y0 = vshrq_n_s16(b1_y0, 1);
+  b1_y1 = vshrq_n_s16(b1_y1, 1);
+
+  /* Restore sign bit */
+  b0_y0 = veorq_s16(b0_y0, b0_sz0);
+  b0_y1 = veorq_s16(b0_y1, b0_sz1);
+  b0_x0 = vsubq_s16(b0_y0, b0_sz0);
+  b0_x1 = vsubq_s16(b0_y1, b0_sz1);
+  b1_y0 = veorq_s16(b1_y0, b1_sz0);
+  b1_y1 = veorq_s16(b1_y1, b1_sz1);
+  b1_x0 = vsubq_s16(b1_y0, b1_sz0);
+  b1_x1 = vsubq_s16(b1_y1, b1_sz1);
+
+  /* find non-zero elements */
+  b0_eob0 = vtstq_s16(b0_x0, one_q);
+  b0_eob1 = vtstq_s16(b0_x1, one_q);
+  b1_eob0 = vtstq_s16(b1_x0, one_q);
+  b1_eob1 = vtstq_s16(b1_x1, one_q);
+
+  /* mask zig zag */
+  b0_eob0 = vandq_u16(b0_eob0, zig_zag0);
+  b0_eob1 = vandq_u16(b0_eob1, zig_zag1);
+  b1_eob0 = vandq_u16(b1_eob0, zig_zag0);
+  b1_eob1 = vandq_u16(b1_eob1, zig_zag1);
+
+  /* select the largest value */
+  b0_eob0 = vmaxq_u16(b0_eob0, b0_eob1);
+  b0_eob_d16 = vmax_u16(vget_low_u16(b0_eob0),
+                        vget_high_u16(b0_eob0));
+  b0_eob_q32 = vmovl_u16(b0_eob_d16);
+  b0_eob_d32 = vmax_u32(vget_low_u32(b0_eob_q32),
+                        vget_high_u32(b0_eob_q32));
+  b0_eob_d32 = vpmax_u32(b0_eob_d32, b0_eob_d32);
+
+  b1_eob0 = vmaxq_u16(b1_eob0, b1_eob1);
+  b1_eob_d16 = vmax_u16(vget_low_u16(b1_eob0),
+                        vget_high_u16(b1_eob0));
+  b1_eob_q32 = vmovl_u16(b1_eob_d16);
+  b1_eob_d32 = vmax_u32(vget_low_u32(b1_eob_q32),
+                        vget_high_u32(b1_eob_q32));
+  b1_eob_d32 = vpmax_u32(b1_eob_d32, b1_eob_d32);
+
+  /* qcoeff = x */
+  vst1q_s16(d0->qcoeff, b0_x0);
+  vst1q_s16(d0->qcoeff + 8, b0_x1);
+  vst1q_s16(d1->qcoeff, b1_x0);
+  vst1q_s16(d1->qcoeff + 8, b1_x1);
+
+  /* dqcoeff = x * dequant */
+  vst1q_s16(d0->dqcoeff, vmulq_s16(d0_dequant0, b0_x0));
+  vst1q_s16(d0->dqcoeff + 8, vmulq_s16(d0_dequant1, b0_x1));
+  vst1q_s16(d1->dqcoeff, vmulq_s16(d1_dequant0, b1_x0));
+  vst1q_s16(d1->dqcoeff + 8, vmulq_s16(d1_dequant1, b1_x1));
+
+  vst1_lane_s8((int8_t *)d0->eob, vreinterpret_s8_u32(b0_eob_d32), 0);
+  vst1_lane_s8((int8_t *)d1->eob, vreinterpret_s8_u32(b1_eob_d32), 0);
+  return;
+}
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index ed19fd4e1..553b1c405 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -35,9 +35,8 @@ VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
 # encoder
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
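
One detail worth calling out when reading the intrinsics: vqdmulhq_s16 is a
doubling multiply, returning the saturated high half of 2 * a * b per lane,
which is why every product above is followed by vshrq_n_s16(..., 1). It also
explains one_q: the mask handed to vtst must have every bit of each lane set
(matching the asm's vceq.s16 q8, q8), hence vdupq_n_s16(-1). A standalone
check of the shift identity (a hypothetical test file, not part of the patch;
assumes a NEON-capable toolchain):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      /* values in the typical range of (abs(z) + round) and quant_fast */
      const int16_t x = 1234, q = 20000;
      const int16x8_t vx = vdupq_n_s16(x), vq = vdupq_n_s16(q);
      /* vqdmulh gives (2 * x * q) >> 16; one arithmetic shift right
       * recovers (x * q) >> 16 */
      const int16x8_t y = vshrq_n_s16(vqdmulhq_s16(vx, vq), 1);
      printf("neon: %d  scalar: %d\n", vgetq_lane_s16(y, 0),
             (int)(((int32_t)x * q) >> 16));
      return 0;
    }

Both lines print 376 here, since floor(floor(2xq / 2^16) / 2) equals
floor(xq / 2^16) for arithmetic shifts.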