/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/idct_neon.h"
15 #include "vpx_dsp/txfm_common.h"
#if CONFIG_VP9_HIGHBITDEPTH
// Narrows one vector of 8 tran_low_t coefficients to int16_t, stores it to
// *out, and advances both pointers past the 8 consumed elements.
static INLINE void idct16x16_256_add_load_tran_low_kernel(
    const tran_low_t **input, int16_t **out) {
  int16x8_t s;

  s = load_tran_low_to_s16q(*input);
  vst1q_s16(*out, s);
  *input += 8;
  *out += 8;
}

// Narrows a full 16x16 block of tran_low_t coefficients to int16_t:
// 256 elements = 32 vectors of 8 lanes each.
static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input,
                                                   int16_t *out) {
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
  idct16x16_256_add_load_tran_low_kernel(&input, &out);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
65 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
66 int16x4_t *const d1) {
67 *d0 = vrshrn_n_s32(t32[0], 14);
68 *d1 = vrshrn_n_s32(t32[1], 14);
71 static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
72 const int16x4_t cospi_2_30_10_22,
73 int16x8_t *const d0, int16x8_t *const d1) {
76 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
77 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
78 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
79 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
80 t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
81 t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
82 t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
83 t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
84 idct16x16_add_wrap_low_8x2(t32, d0, d1);
87 static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
88 const int16x4_t cospi_4_12_20N_28,
89 int16x8_t *const d0, int16x8_t *const d1) {
92 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
93 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
94 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
95 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
96 t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
97 t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
98 t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
99 t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
100 idct16x16_add_wrap_low_8x2(t32, d0, d1);
103 static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
104 const int16x4_t cospi_6_26_14_18N,
105 int16x8_t *const d0, int16x8_t *const d1) {
108 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
109 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
110 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
111 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
112 t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
113 t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
114 t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
115 t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
116 idct16x16_add_wrap_low_8x2(t32, d0, d1);
119 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
121 const int16x4_t cospi_0_8_16_24,
122 int32x4_t *const t32) {
123 t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
124 t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
125 t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
126 t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
129 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
130 const int16x4_t cospi_0_8_16_24,
131 int16x4_t *const d0, int16x4_t *const d1) {
134 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
135 wrap_low_4x2(t32, d0, d1);
138 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
139 const int16x4_t cospi_0_8_16_24,
141 int16x4_t *const d1) {
144 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
145 t32[1] = vnegq_s32(t32[1]);
146 wrap_low_4x2(t32, d0, d1);
149 static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
150 const int16x4_t cospi_2_30_10_22,
151 int16x8_t *const d0, int16x8_t *const d1) {
154 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
155 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
156 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
157 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
158 t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
159 t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
160 t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
161 t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
162 idct16x16_add_wrap_low_8x2(t32, d0, d1);
165 static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
166 const int16x4_t cospi_4_12_20N_28,
167 int16x8_t *const d0, int16x8_t *const d1) {
170 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
171 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
172 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
173 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
174 t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
175 t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
176 t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
177 t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
178 idct16x16_add_wrap_low_8x2(t32, d0, d1);
181 static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
182 const int16x4_t cospi_6_26_14_18N,
183 int16x8_t *const d0, int16x8_t *const d1) {
186 t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
187 t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
188 t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
189 t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
190 t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
191 t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
192 t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
193 t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
194 idct16x16_add_wrap_low_8x2(t32, d0, d1);
197 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
198 const int16x4_t cospi_0_8_16_24,
200 int16x4_t *const d1) {
203 t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
204 t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
205 t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
206 wrap_low_4x2(t32, d0, d1);
209 static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
210 uint8_t *dest, int stride) {
211 const int16x8_t cospis0 = vld1q_s16(kCospi);
212 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
213 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
214 const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
215 const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
216 const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1);
217 int16x8_t in[16], step1[16], step2[16], out[16];
220 in[0] = vld1q_s16(input);
222 in[8] = vld1q_s16(input);
224 in[1] = vld1q_s16(input);
226 in[9] = vld1q_s16(input);
228 in[2] = vld1q_s16(input);
230 in[10] = vld1q_s16(input);
232 in[3] = vld1q_s16(input);
234 in[11] = vld1q_s16(input);
236 in[4] = vld1q_s16(input);
238 in[12] = vld1q_s16(input);
240 in[5] = vld1q_s16(input);
242 in[13] = vld1q_s16(input);
244 in[6] = vld1q_s16(input);
246 in[14] = vld1q_s16(input);
248 in[7] = vld1q_s16(input);
250 in[15] = vld1q_s16(input);
253 transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
255 transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
259 step1[0] = in[0 / 2];
260 step1[1] = in[16 / 2];
261 step1[2] = in[8 / 2];
262 step1[3] = in[24 / 2];
263 step1[4] = in[4 / 2];
264 step1[5] = in[20 / 2];
265 step1[6] = in[12 / 2];
266 step1[7] = in[28 / 2];
267 step1[8] = in[2 / 2];
268 step1[9] = in[18 / 2];
269 step1[10] = in[10 / 2];
270 step1[11] = in[26 / 2];
271 step1[12] = in[6 / 2];
272 step1[13] = in[22 / 2];
273 step1[14] = in[14 / 2];
274 step1[15] = in[30 / 2];
285 idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
286 idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
288 idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
290 idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
298 idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
299 idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
300 step1[8] = vaddq_s16(step2[8], step2[9]);
301 step1[9] = vsubq_s16(step2[8], step2[9]);
302 step1[10] = vsubq_s16(step2[11], step2[10]);
303 step1[11] = vaddq_s16(step2[11], step2[10]);
304 step1[12] = vaddq_s16(step2[12], step2[13]);
305 step1[13] = vsubq_s16(step2[12], step2[13]);
306 step1[14] = vsubq_s16(step2[15], step2[14]);
307 step1[15] = vaddq_s16(step2[15], step2[14]);
310 idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
311 idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
312 step2[4] = vaddq_s16(step1[4], step1[5]);
313 step2[5] = vsubq_s16(step1[4], step1[5]);
314 step2[6] = vsubq_s16(step1[7], step1[6]);
315 step2[7] = vaddq_s16(step1[7], step1[6]);
317 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
319 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
321 step2[11] = step1[11];
322 step2[12] = step1[12];
323 step2[15] = step1[15];
326 step1[0] = vaddq_s16(step2[0], step2[3]);
327 step1[1] = vaddq_s16(step2[1], step2[2]);
328 step1[2] = vsubq_s16(step2[1], step2[2]);
329 step1[3] = vsubq_s16(step2[0], step2[3]);
331 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
333 step1[8] = vaddq_s16(step2[8], step2[11]);
334 step1[9] = vaddq_s16(step2[9], step2[10]);
335 step1[10] = vsubq_s16(step2[9], step2[10]);
336 step1[11] = vsubq_s16(step2[8], step2[11]);
337 step1[12] = vsubq_s16(step2[15], step2[12]);
338 step1[13] = vsubq_s16(step2[14], step2[13]);
339 step1[14] = vaddq_s16(step2[14], step2[13]);
340 step1[15] = vaddq_s16(step2[15], step2[12]);
343 step2[0] = vaddq_s16(step1[0], step1[7]);
344 step2[1] = vaddq_s16(step1[1], step1[6]);
345 step2[2] = vaddq_s16(step1[2], step1[5]);
346 step2[3] = vaddq_s16(step1[3], step1[4]);
347 step2[4] = vsubq_s16(step1[3], step1[4]);
348 step2[5] = vsubq_s16(step1[2], step1[5]);
349 step2[6] = vsubq_s16(step1[1], step1[6]);
350 step2[7] = vsubq_s16(step1[0], step1[7]);
351 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
353 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
357 step2[14] = step1[14];
358 step2[15] = step1[15];
361 out[0] = vaddq_s16(step2[0], step2[15]);
362 out[1] = vaddq_s16(step2[1], step2[14]);
363 out[2] = vaddq_s16(step2[2], step2[13]);
364 out[3] = vaddq_s16(step2[3], step2[12]);
365 out[4] = vaddq_s16(step2[4], step2[11]);
366 out[5] = vaddq_s16(step2[5], step2[10]);
367 out[6] = vaddq_s16(step2[6], step2[9]);
368 out[7] = vaddq_s16(step2[7], step2[8]);
369 out[8] = vsubq_s16(step2[7], step2[8]);
370 out[9] = vsubq_s16(step2[6], step2[9]);
371 out[10] = vsubq_s16(step2[5], step2[10]);
372 out[11] = vsubq_s16(step2[4], step2[11]);
373 out[12] = vsubq_s16(step2[3], step2[12]);
374 out[13] = vsubq_s16(step2[2], step2[13]);
375 out[14] = vsubq_s16(step2[1], step2[14]);
376 out[15] = vsubq_s16(step2[0], step2[15]);
379 // pass 1: save the result into output
380 vst1q_s16(output, out[0]);
382 vst1q_s16(output, out[1]);
384 vst1q_s16(output, out[2]);
386 vst1q_s16(output, out[3]);
388 vst1q_s16(output, out[4]);
390 vst1q_s16(output, out[5]);
392 vst1q_s16(output, out[6]);
394 vst1q_s16(output, out[7]);
396 vst1q_s16(output, out[8]);
398 vst1q_s16(output, out[9]);
400 vst1q_s16(output, out[10]);
402 vst1q_s16(output, out[11]);
404 vst1q_s16(output, out[12]);
406 vst1q_s16(output, out[13]);
408 vst1q_s16(output, out[14]);
410 vst1q_s16(output, out[15]);
412 // pass 2: add the result to dest.
413 idct16x16_add8x1(out[0], &dest, stride);
414 idct16x16_add8x1(out[1], &dest, stride);
415 idct16x16_add8x1(out[2], &dest, stride);
416 idct16x16_add8x1(out[3], &dest, stride);
417 idct16x16_add8x1(out[4], &dest, stride);
418 idct16x16_add8x1(out[5], &dest, stride);
419 idct16x16_add8x1(out[6], &dest, stride);
420 idct16x16_add8x1(out[7], &dest, stride);
421 idct16x16_add8x1(out[8], &dest, stride);
422 idct16x16_add8x1(out[9], &dest, stride);
423 idct16x16_add8x1(out[10], &dest, stride);
424 idct16x16_add8x1(out[11], &dest, stride);
425 idct16x16_add8x1(out[12], &dest, stride);
426 idct16x16_add8x1(out[13], &dest, stride);
427 idct16x16_add8x1(out[14], &dest, stride);
428 idct16x16_add8x1(out[15], &dest, stride);
432 static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
434 const int16x8_t cospis0 = vld1q_s16(kCospi);
435 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
436 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
437 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
438 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
439 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
440 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
441 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
442 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
443 int16x4_t in[4], step1[16], step2[16], out[16];
446 #if CONFIG_VP9_HIGHBITDEPTH
447 in[0] = load_tran_low_to_s16d(input);
449 in[1] = load_tran_low_to_s16d(input);
451 in[2] = load_tran_low_to_s16d(input);
453 in[3] = load_tran_low_to_s16d(input);
455 in[0] = vld1_s16(input);
457 in[1] = vld1_s16(input);
459 in[2] = vld1_s16(input);
461 in[3] = vld1_s16(input);
462 #endif // CONFIG_VP9_HIGHBITDEPTH
465 transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
468 step1[0] = in[0 / 2];
469 step1[4] = in[4 / 2];
470 step1[8] = in[2 / 2];
471 step1[12] = in[6 / 2];
476 step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
477 step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
478 step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
479 step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
483 step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
484 step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
487 step1[10] = step2[11];
488 step1[11] = step2[11];
489 step1[12] = step2[12];
490 step1[13] = step2[12];
491 step1[14] = step2[15];
492 step1[15] = step2[15];
495 step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
501 idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
503 idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
505 step2[11] = step1[11];
506 step2[12] = step1[12];
507 step2[15] = step1[15];
515 idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
517 step1[8] = vadd_s16(step2[8], step2[11]);
518 step1[9] = vadd_s16(step2[9], step2[10]);
519 step1[10] = vsub_s16(step2[9], step2[10]);
520 step1[11] = vsub_s16(step2[8], step2[11]);
521 step1[12] = vsub_s16(step2[15], step2[12]);
522 step1[13] = vsub_s16(step2[14], step2[13]);
523 step1[14] = vadd_s16(step2[14], step2[13]);
524 step1[15] = vadd_s16(step2[15], step2[12]);
527 step2[0] = vadd_s16(step1[0], step1[7]);
528 step2[1] = vadd_s16(step1[1], step1[6]);
529 step2[2] = vadd_s16(step1[2], step1[5]);
530 step2[3] = vadd_s16(step1[3], step1[4]);
531 step2[4] = vsub_s16(step1[3], step1[4]);
532 step2[5] = vsub_s16(step1[2], step1[5]);
533 step2[6] = vsub_s16(step1[1], step1[6]);
534 step2[7] = vsub_s16(step1[0], step1[7]);
535 idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
537 idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
541 step2[14] = step1[14];
542 step2[15] = step1[15];
545 out[0] = vadd_s16(step2[0], step2[15]);
546 out[1] = vadd_s16(step2[1], step2[14]);
547 out[2] = vadd_s16(step2[2], step2[13]);
548 out[3] = vadd_s16(step2[3], step2[12]);
549 out[4] = vadd_s16(step2[4], step2[11]);
550 out[5] = vadd_s16(step2[5], step2[10]);
551 out[6] = vadd_s16(step2[6], step2[9]);
552 out[7] = vadd_s16(step2[7], step2[8]);
553 out[8] = vsub_s16(step2[7], step2[8]);
554 out[9] = vsub_s16(step2[6], step2[9]);
555 out[10] = vsub_s16(step2[5], step2[10]);
556 out[11] = vsub_s16(step2[4], step2[11]);
557 out[12] = vsub_s16(step2[3], step2[12]);
558 out[13] = vsub_s16(step2[2], step2[13]);
559 out[14] = vsub_s16(step2[1], step2[14]);
560 out[15] = vsub_s16(step2[0], step2[15]);
562 // pass 1: save the result into output
563 vst1_s16(output, out[0]);
565 vst1_s16(output, out[1]);
567 vst1_s16(output, out[2]);
569 vst1_s16(output, out[3]);
571 vst1_s16(output, out[4]);
573 vst1_s16(output, out[5]);
575 vst1_s16(output, out[6]);
577 vst1_s16(output, out[7]);
579 vst1_s16(output, out[8]);
581 vst1_s16(output, out[9]);
583 vst1_s16(output, out[10]);
585 vst1_s16(output, out[11]);
587 vst1_s16(output, out[12]);
589 vst1_s16(output, out[13]);
591 vst1_s16(output, out[14]);
593 vst1_s16(output, out[15]);
596 static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
597 uint8_t *dest, int stride) {
598 const int16x8_t cospis0 = vld1q_s16(kCospi);
599 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
600 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
601 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
602 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
603 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
604 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
605 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
606 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
608 int16x8_t in[4], step1[16], step2[16], out[16];
611 ind[0] = vld1_s16(input);
613 ind[1] = vld1_s16(input);
615 ind[2] = vld1_s16(input);
617 ind[3] = vld1_s16(input);
619 ind[4] = vld1_s16(input);
621 ind[5] = vld1_s16(input);
623 ind[6] = vld1_s16(input);
625 ind[7] = vld1_s16(input);
628 transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
629 ind[7], &in[0], &in[1], &in[2], &in[3]);
632 step1[0] = in[0 / 2];
633 step1[4] = in[4 / 2];
634 step1[8] = in[2 / 2];
635 step1[12] = in[6 / 2];
640 step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
641 step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
642 step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
643 step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
647 step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
648 step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
651 step1[10] = step2[11];
652 step1[11] = step2[11];
653 step1[12] = step2[12];
654 step1[13] = step2[12];
655 step1[14] = step2[15];
656 step1[15] = step2[15];
659 step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
665 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
667 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
669 step2[11] = step1[11];
670 step2[12] = step1[12];
671 step2[15] = step1[15];
679 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
681 step1[8] = vaddq_s16(step2[8], step2[11]);
682 step1[9] = vaddq_s16(step2[9], step2[10]);
683 step1[10] = vsubq_s16(step2[9], step2[10]);
684 step1[11] = vsubq_s16(step2[8], step2[11]);
685 step1[12] = vsubq_s16(step2[15], step2[12]);
686 step1[13] = vsubq_s16(step2[14], step2[13]);
687 step1[14] = vaddq_s16(step2[14], step2[13]);
688 step1[15] = vaddq_s16(step2[15], step2[12]);
691 step2[0] = vaddq_s16(step1[0], step1[7]);
692 step2[1] = vaddq_s16(step1[1], step1[6]);
693 step2[2] = vaddq_s16(step1[2], step1[5]);
694 step2[3] = vaddq_s16(step1[3], step1[4]);
695 step2[4] = vsubq_s16(step1[3], step1[4]);
696 step2[5] = vsubq_s16(step1[2], step1[5]);
697 step2[6] = vsubq_s16(step1[1], step1[6]);
698 step2[7] = vsubq_s16(step1[0], step1[7]);
699 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
701 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
705 step2[14] = step1[14];
706 step2[15] = step1[15];
709 out[0] = vaddq_s16(step2[0], step2[15]);
710 out[1] = vaddq_s16(step2[1], step2[14]);
711 out[2] = vaddq_s16(step2[2], step2[13]);
712 out[3] = vaddq_s16(step2[3], step2[12]);
713 out[4] = vaddq_s16(step2[4], step2[11]);
714 out[5] = vaddq_s16(step2[5], step2[10]);
715 out[6] = vaddq_s16(step2[6], step2[9]);
716 out[7] = vaddq_s16(step2[7], step2[8]);
717 out[8] = vsubq_s16(step2[7], step2[8]);
718 out[9] = vsubq_s16(step2[6], step2[9]);
719 out[10] = vsubq_s16(step2[5], step2[10]);
720 out[11] = vsubq_s16(step2[4], step2[11]);
721 out[12] = vsubq_s16(step2[3], step2[12]);
722 out[13] = vsubq_s16(step2[2], step2[13]);
723 out[14] = vsubq_s16(step2[1], step2[14]);
724 out[15] = vsubq_s16(step2[0], step2[15]);
727 // pass 1: save the result into output
728 vst1q_s16(output, out[0]);
730 vst1q_s16(output, out[1]);
732 vst1q_s16(output, out[2]);
734 vst1q_s16(output, out[3]);
736 vst1q_s16(output, out[4]);
738 vst1q_s16(output, out[5]);
740 vst1q_s16(output, out[6]);
742 vst1q_s16(output, out[7]);
744 vst1q_s16(output, out[8]);
746 vst1q_s16(output, out[9]);
748 vst1q_s16(output, out[10]);
750 vst1q_s16(output, out[11]);
752 vst1q_s16(output, out[12]);
754 vst1q_s16(output, out[13]);
756 vst1q_s16(output, out[14]);
758 vst1q_s16(output, out[15]);
760 // pass 2: add the result to dest.
761 idct16x16_add8x1(out[0], &dest, stride);
762 idct16x16_add8x1(out[1], &dest, stride);
763 idct16x16_add8x1(out[2], &dest, stride);
764 idct16x16_add8x1(out[3], &dest, stride);
765 idct16x16_add8x1(out[4], &dest, stride);
766 idct16x16_add8x1(out[5], &dest, stride);
767 idct16x16_add8x1(out[6], &dest, stride);
768 idct16x16_add8x1(out[7], &dest, stride);
769 idct16x16_add8x1(out[8], &dest, stride);
770 idct16x16_add8x1(out[9], &dest, stride);
771 idct16x16_add8x1(out[10], &dest, stride);
772 idct16x16_add8x1(out[11], &dest, stride);
773 idct16x16_add8x1(out[12], &dest, stride);
774 idct16x16_add8x1(out[13], &dest, stride);
775 idct16x16_add8x1(out[14], &dest, stride);
776 idct16x16_add8x1(out[15], &dest, stride);
780 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
782 int16_t row_idct_output[16 * 16];
784 #if CONFIG_VP9_HIGHBITDEPTH
785 int16_t pass1_input[16 * 16];
786 idct16x16_256_add_load_tran_low(input, pass1_input);
788 const int16_t *pass1_input = input;
789 #endif // CONFIG_VP9_HIGHBITDEPTH
792 // Parallel idct on the upper 8 rows
793 idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride);
795 // Parallel idct on the lower 8 rows
796 idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest,
800 // Parallel idct to get the left 8 columns
801 idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride);
803 // Parallel idct to get the right 8 columns
804 idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
807 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
809 int16_t row_idct_output[4 * 16];
812 // Parallel idct on the upper 8 rows
813 idct16x16_10_add_half1d_pass1(input, row_idct_output);
816 // Parallel idct to get the left 8 columns
817 idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
819 // Parallel idct to get the right 8 columns
820 idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,