From: James Yu Date: Mon, 27 Jan 2014 10:38:35 +0000 (+0800) Subject: VP9 common for ARMv8 by using NEON intrinsics 05 X-Git-Tag: v1.4.0~356^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6b7101327723436ad78047e30ae752429836c528;p=libvpx VP9 common for ARMv8 by using NEON intrinsics 05 Add vp9_iht4x4_add_neon.c - vp9_iht4x4_16_add_neon The assembly did not previously implement tx_type 0 BUG=715 Change-Id: I60034d1568de034edba45c5cdd13f3d87dbc73b6 Signed-off-by: James Yu --- diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index d6a3473ae..5357a8d2d 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -474,14 +474,17 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( - DISABLED_NEON, Trans4x4HT, + NEON, Trans4x4HT, ::testing::Values( make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH && \ !CONFIG_EMULATE_HARDWARE diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm deleted file mode 100644 index 2f326e24c..000000000 --- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_iht4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are - ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain - ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back - ; into d16-d19 registers. This macro will touch q10- q15 registers and use - ; them as buffer during calculation. - MACRO - IDCT4x4_1D - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64 - vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64 - vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64 - vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q10, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - MEND - - ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which - ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9. - ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be - ; stored back into d16-d19 registers. This macro will touch q11,q12,q13, - ; q14,q15 registers and use them as buffer during calculation. - MACRO - IADST4x4_1D - vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0 - vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0 - vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1 - vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2 - vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2 - vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit - vaddw.s16 q15, q15, d19 ; x0 + x3 - vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3 - vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2 - vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3 - - vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5 - vadd.s32 q10, q10, q8 - vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6 - vdup.32 q8, r0 ; duplicate sinpi_3_9 - vsub.s32 q11, q11, q9 - vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7 - - vadd.s32 q13, q10, q12 ; s0 = x0 + x3 - vadd.s32 q10, q10, q11 ; x0 + x1 - vadd.s32 q14, q11, q12 ; s1 = x1 + x3 - vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3 - - ; dct_const_round_shift - vqrshrn.s32 d16, q13, #14 - vqrshrn.s32 d17, q14, #14 - vqrshrn.s32 d18, q15, #14 - vqrshrn.s32 d19, q10, #14 - MEND - - ; Generate cosine constants in d6 - d8 for the IDCT - MACRO - GENERATE_COSINE_CONSTANTS - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x187e - mov r12, #0x1800 - add r12, #0x7e - - ; generate constant vectors - vdup.16 d0, r0 ; duplicate cospi_8_64 - vdup.16 d1, r3 ; duplicate cospi_16_64 - vdup.16 d2, r12 ; duplicate cospi_24_64 - MEND - - ; Generate sine constants in d1 - d4 for the IADST. - MACRO - GENERATE_SINE_CONSTANTS - ; sinpi_1_9 = 5283 = 0x14A3 - mov r0, #0x1400 - add r0, #0xa3 - ; sinpi_2_9 = 9929 = 0x26C9 - mov r3, #0x2600 - add r3, #0xc9 - ; sinpi_4_9 = 15212 = 0x3B6C - mov r12, #0x3b00 - add r12, #0x6c - - ; generate constant vectors - vdup.16 d3, r0 ; duplicate sinpi_1_9 - - ; sinpi_3_9 = 13377 = 0x3441 - mov r0, #0x3400 - add r0, #0x41 - - vdup.16 d4, r3 ; duplicate sinpi_2_9 - vdup.16 d5, r12 ; duplicate sinpi_4_9 - vdup.16 q3, r0 ; duplicate sinpi_3_9 - MEND - - ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19. - MACRO - TRANSPOSE4X4 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - vtrn.32 q8, q9 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride, int tx_type) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride -; r3 int tx_type) -; This function will only handle tx_type of 1,2,3. -|vp9_iht4x4_16_add_neon| PROC - - ; load the inputs into d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; transpose the input data - TRANSPOSE4X4 - - ; decide the type of transform - cmp r3, #2 - beq idct_iadst - cmp r3, #3 - beq iadst_iadst - -iadst_idct - ; generate constants - GENERATE_COSINE_CONSTANTS - GENERATE_SINE_CONSTANTS - - ; first transform rows - IDCT4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IADST4x4_1D - - b end_vp9_iht4x4_16_add_neon - -idct_iadst - ; generate constants - GENERATE_COSINE_CONSTANTS - GENERATE_SINE_CONSTANTS - - ; first transform rows - IADST4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IDCT4x4_1D - - b end_vp9_iht4x4_16_add_neon - -iadst_iadst - ; generate constants - GENERATE_SINE_CONSTANTS - - ; first transform rows - IADST4x4_1D - - ; transpose the matrix - TRANSPOSE4X4 - - ; then transform columns - IADST4x4_1D - -end_vp9_iht4x4_16_add_neon - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1], r2 - vld1.32 {d27[1]}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |vp9_iht4x4_16_add_neon| - - END diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c new file mode 100644 index 000000000..cd8c358fd --- /dev/null +++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +static int16_t sinpi_1_9 = 0x14a3; +static int16_t sinpi_2_9 = 0x26c9; +static int16_t sinpi_3_9 = 0x3441; +static int16_t sinpi_4_9 = 0x3b6c; +static int16_t cospi_8_64 = 0x3b21; +static int16_t cospi_16_64 = 0x2d41; +static int16_t cospi_24_64 = 0x187e; + +static inline void TRANSPOSE4X4( + int16x8_t *q8s16, + int16x8_t *q9s16) { + int32x4_t q8s32, q9s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + + d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); + d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); + + q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); + q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); + q0x2s32 = vtrnq_s32(q8s32, q9s32); + + *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); + *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); + return; +} + +static inline void GENERATE_COSINE_CONSTANTS( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16) { + *d0s16 = vdup_n_s16(cospi_8_64); + *d1s16 = vdup_n_s16(cospi_16_64); + *d2s16 = vdup_n_s16(cospi_24_64); + return; +} + +static inline void GENERATE_SINE_CONSTANTS( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16) { + *d3s16 = vdup_n_s16(sinpi_1_9); + *d4s16 = vdup_n_s16(sinpi_2_9); + *q3s16 = vdupq_n_s16(sinpi_3_9); + *d5s16 = vdup_n_s16(sinpi_4_9); + return; +} + +static inline void IDCT4x4_1D( + int16x4_t *d0s16, + int16x4_t *d1s16, + int16x4_t *d2s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + int32x4_t q10s32, q13s32, q14s32, q15s32; + int16x8_t q13s16, q14s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, *d2s16); + q10s32 = vmull_s16(d17s16, *d0s16); + q13s32 = vmull_s16(d23s16, *d1s16); + q14s32 = vmull_s16(d24s16, *d1s16); + q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); + q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q10s32, 14); + + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + *q8s16 = vaddq_s16(q13s16, q14s16); + *q9s16 = vsubq_s16(q13s16, q14s16); + *q9s16 = vcombine_s16(vget_high_s16(*q9s16), + vget_low_s16(*q9s16)); // vswp + return; +} + +static inline void IADST4x4_1D( + int16x4_t *d3s16, + int16x4_t *d4s16, + int16x4_t *d5s16, + int16x8_t *q3s16, + int16x8_t *q8s16, + int16x8_t *q9s16) { + int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; + + d6s16 = vget_low_s16(*q3s16); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + + q10s32 = vmull_s16(*d3s16, d16s16); + q11s32 = vmull_s16(*d4s16, d16s16); + q12s32 = vmull_s16(d6s16, d17s16); + q13s32 = vmull_s16(*d5s16, d18s16); + q14s32 = vmull_s16(*d3s16, d18s16); + q15s32 = vmovl_s16(d16s16); + q15s32 = vaddw_s16(q15s32, d19s16); + q8s32 = vmull_s16(*d4s16, d19s16); + q15s32 = vsubw_s16(q15s32, d18s16); + q9s32 = vmull_s16(*d5s16, d19s16); + + q10s32 = vaddq_s32(q10s32, q13s32); + q10s32 = vaddq_s32(q10s32, q8s32); + q11s32 = vsubq_s32(q11s32, q14s32); + q8s32 = vdupq_n_s32(sinpi_3_9); + q11s32 = vsubq_s32(q11s32, q9s32); + q15s32 = vmulq_s32(q15s32, q8s32); + + q13s32 = vaddq_s32(q10s32, q12s32); + q10s32 = vaddq_s32(q10s32, q11s32); + q14s32 = vaddq_s32(q11s32, q12s32); + q10s32 = vsubq_s32(q10s32, q12s32); + + d16s16 = vqrshrn_n_s32(q13s32, 14); + d17s16 = vqrshrn_n_s32(q14s32, 14); + d18s16 = vqrshrn_n_s32(q15s32, 14); + d19s16 = vqrshrn_n_s32(q10s32, 14); + + *q8s16 = vcombine_s16(d16s16, d17s16); + *q9s16 = vcombine_s16(d18s16, d19s16); + return; +} + +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, + int dest_stride, int tx_type) { + uint8x8_t d26u8, d27u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; + uint32x2_t d26u32, d27u32; + int16x8_t q3s16, q8s16, q9s16; + uint16x8_t q8u16, q9u16; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + TRANSPOSE4X4(&q8s16, &q9s16); + + switch (tx_type) { + case 0: // idct_idct is not supported. Fall back to C + vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + return; + break; + case 1: // iadst_idct + // generate constants + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + case 2: // idct_iadst + // generate constantsyy + GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + break; + case 3: // iadst_iadst + // generate constants + GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); + + // first transform rows + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + + // transpose the matrix + TRANSPOSE4X4(&q8s16, &q9s16); + + // then transform columns + IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + break; + default: // iadst_idct + assert(0); + break; + } + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); + dest += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); + dest += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); + dest -= dest_stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + return; +} diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 575990bb5..4c0df912f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -457,8 +457,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_idct32x32_1_add sse2 neon dspr2/; add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/; - $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon; + specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/; add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 2504f4db9..c2d1a7af2 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -135,12 +135,13 @@ endif VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) -VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c + # neon with assembly and intrinsics implementations. If both are available # prefer assembly. ifeq ($(HAVE_NEON_ASM), yes)