/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/txfm_common.h"
17 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
18 int16x4_t *const d1) {
19 *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
20 *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
23 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
25 const int16x4_t cospi_0_8_16_24,
26 int32x4_t *const t32) {
27 t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
28 t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
29 t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
30 t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
33 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
34 const int16x4_t cospi_0_8_16_24,
35 int16x4_t *const d0, int16x4_t *const d1) {
38 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
39 wrap_low_4x2(t32, d0, d1);
42 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
43 const int16x4_t cospi_0_8_16_24,
45 int16x4_t *const d1) {
48 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
49 t32[1] = vnegq_s32(t32[1]);
50 wrap_low_4x2(t32, d0, d1);
53 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
54 const int16x4_t cospi_0_8_16_24,
56 int16x4_t *const d1) {
59 t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
60 t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
61 t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
62 wrap_low_4x2(t32, d0, d1);
65 static INLINE void idct16x16_add_store(const int16x8_t *const out,
66 uint8_t *dest, const int stride) {
67 // Add the result to dest
68 idct16x16_add8x1(out[0], &dest, stride);
69 idct16x16_add8x1(out[1], &dest, stride);
70 idct16x16_add8x1(out[2], &dest, stride);
71 idct16x16_add8x1(out[3], &dest, stride);
72 idct16x16_add8x1(out[4], &dest, stride);
73 idct16x16_add8x1(out[5], &dest, stride);
74 idct16x16_add8x1(out[6], &dest, stride);
75 idct16x16_add8x1(out[7], &dest, stride);
76 idct16x16_add8x1(out[8], &dest, stride);
77 idct16x16_add8x1(out[9], &dest, stride);
78 idct16x16_add8x1(out[10], &dest, stride);
79 idct16x16_add8x1(out[11], &dest, stride);
80 idct16x16_add8x1(out[12], &dest, stride);
81 idct16x16_add8x1(out[13], &dest, stride);
82 idct16x16_add8x1(out[14], &dest, stride);
83 idct16x16_add8x1(out[15], &dest, stride);
86 static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
88 // Add the result to dest
89 const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
90 out[0] = vrshrq_n_s16(out[0], 6);
91 out[1] = vrshrq_n_s16(out[1], 6);
92 out[2] = vrshrq_n_s16(out[2], 6);
93 out[3] = vrshrq_n_s16(out[3], 6);
94 out[4] = vrshrq_n_s16(out[4], 6);
95 out[5] = vrshrq_n_s16(out[5], 6);
96 out[6] = vrshrq_n_s16(out[6], 6);
97 out[7] = vrshrq_n_s16(out[7], 6);
98 out[8] = vrshrq_n_s16(out[8], 6);
99 out[9] = vrshrq_n_s16(out[9], 6);
100 out[10] = vrshrq_n_s16(out[10], 6);
101 out[11] = vrshrq_n_s16(out[11], 6);
102 out[12] = vrshrq_n_s16(out[12], 6);
103 out[13] = vrshrq_n_s16(out[13], 6);
104 out[14] = vrshrq_n_s16(out[14], 6);
105 out[15] = vrshrq_n_s16(out[15], 6);
106 highbd_idct16x16_add8x1(out[0], max, &dest, stride);
107 highbd_idct16x16_add8x1(out[1], max, &dest, stride);
108 highbd_idct16x16_add8x1(out[2], max, &dest, stride);
109 highbd_idct16x16_add8x1(out[3], max, &dest, stride);
110 highbd_idct16x16_add8x1(out[4], max, &dest, stride);
111 highbd_idct16x16_add8x1(out[5], max, &dest, stride);
112 highbd_idct16x16_add8x1(out[6], max, &dest, stride);
113 highbd_idct16x16_add8x1(out[7], max, &dest, stride);
114 highbd_idct16x16_add8x1(out[8], max, &dest, stride);
115 highbd_idct16x16_add8x1(out[9], max, &dest, stride);
116 highbd_idct16x16_add8x1(out[10], max, &dest, stride);
117 highbd_idct16x16_add8x1(out[11], max, &dest, stride);
118 highbd_idct16x16_add8x1(out[12], max, &dest, stride);
119 highbd_idct16x16_add8x1(out[13], max, &dest, stride);
120 highbd_idct16x16_add8x1(out[14], max, &dest, stride);
121 highbd_idct16x16_add8x1(out[15], max, &dest, stride);
124 void idct16x16_256_add_half1d(const void *const input, int16_t *output,
125 void *const dest, const int stride,
126 const int highbd_flag) {
127 const int16x8_t cospis0 = vld1q_s16(kCospi);
128 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
129 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
130 const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
131 const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
132 const int16x4_t cospi_6_26N_14_18N = vget_high_s16(cospis1);
133 int16x8_t in[16], step1[16], step2[16], out[16];
137 const tran_low_t *inputT = (const tran_low_t *)input;
138 in[0] = load_tran_low_to_s16q(inputT);
140 in[8] = load_tran_low_to_s16q(inputT);
142 in[1] = load_tran_low_to_s16q(inputT);
144 in[9] = load_tran_low_to_s16q(inputT);
146 in[2] = load_tran_low_to_s16q(inputT);
148 in[10] = load_tran_low_to_s16q(inputT);
150 in[3] = load_tran_low_to_s16q(inputT);
152 in[11] = load_tran_low_to_s16q(inputT);
154 in[4] = load_tran_low_to_s16q(inputT);
156 in[12] = load_tran_low_to_s16q(inputT);
158 in[5] = load_tran_low_to_s16q(inputT);
160 in[13] = load_tran_low_to_s16q(inputT);
162 in[6] = load_tran_low_to_s16q(inputT);
164 in[14] = load_tran_low_to_s16q(inputT);
166 in[7] = load_tran_low_to_s16q(inputT);
168 in[15] = load_tran_low_to_s16q(inputT);
170 const int16_t *inputT = (const int16_t *)input;
171 in[0] = vld1q_s16(inputT);
173 in[8] = vld1q_s16(inputT);
175 in[1] = vld1q_s16(inputT);
177 in[9] = vld1q_s16(inputT);
179 in[2] = vld1q_s16(inputT);
181 in[10] = vld1q_s16(inputT);
183 in[3] = vld1q_s16(inputT);
185 in[11] = vld1q_s16(inputT);
187 in[4] = vld1q_s16(inputT);
189 in[12] = vld1q_s16(inputT);
191 in[5] = vld1q_s16(inputT);
193 in[13] = vld1q_s16(inputT);
195 in[6] = vld1q_s16(inputT);
197 in[14] = vld1q_s16(inputT);
199 in[7] = vld1q_s16(inputT);
201 in[15] = vld1q_s16(inputT);
205 transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
207 transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
211 step1[0] = in[0 / 2];
212 step1[1] = in[16 / 2];
213 step1[2] = in[8 / 2];
214 step1[3] = in[24 / 2];
215 step1[4] = in[4 / 2];
216 step1[5] = in[20 / 2];
217 step1[6] = in[12 / 2];
218 step1[7] = in[28 / 2];
219 step1[8] = in[2 / 2];
220 step1[9] = in[18 / 2];
221 step1[10] = in[10 / 2];
222 step1[11] = in[26 / 2];
223 step1[12] = in[6 / 2];
224 step1[13] = in[22 / 2];
225 step1[14] = in[14 / 2];
226 step1[15] = in[30 / 2];
237 idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
238 idct_cospi_14_18(step1[9], step1[14], cospi_6_26N_14_18N, &step2[9],
240 idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
242 idct_cospi_6_26(step1[11], step1[12], cospi_6_26N_14_18N, &step2[11],
250 idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
251 idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
252 step1[8] = vaddq_s16(step2[8], step2[9]);
253 step1[9] = vsubq_s16(step2[8], step2[9]);
254 step1[10] = vsubq_s16(step2[11], step2[10]);
255 step1[11] = vaddq_s16(step2[11], step2[10]);
256 step1[12] = vaddq_s16(step2[12], step2[13]);
257 step1[13] = vsubq_s16(step2[12], step2[13]);
258 step1[14] = vsubq_s16(step2[15], step2[14]);
259 step1[15] = vaddq_s16(step2[15], step2[14]);
262 idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
263 idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
264 step2[4] = vaddq_s16(step1[4], step1[5]);
265 step2[5] = vsubq_s16(step1[4], step1[5]);
266 step2[6] = vsubq_s16(step1[7], step1[6]);
267 step2[7] = vaddq_s16(step1[7], step1[6]);
269 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
271 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
273 step2[11] = step1[11];
274 step2[12] = step1[12];
275 step2[15] = step1[15];
278 step1[0] = vaddq_s16(step2[0], step2[3]);
279 step1[1] = vaddq_s16(step2[1], step2[2]);
280 step1[2] = vsubq_s16(step2[1], step2[2]);
281 step1[3] = vsubq_s16(step2[0], step2[3]);
283 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
285 step1[8] = vaddq_s16(step2[8], step2[11]);
286 step1[9] = vaddq_s16(step2[9], step2[10]);
287 step1[10] = vsubq_s16(step2[9], step2[10]);
288 step1[11] = vsubq_s16(step2[8], step2[11]);
289 step1[12] = vsubq_s16(step2[15], step2[12]);
290 step1[13] = vsubq_s16(step2[14], step2[13]);
291 step1[14] = vaddq_s16(step2[14], step2[13]);
292 step1[15] = vaddq_s16(step2[15], step2[12]);
295 step2[0] = vaddq_s16(step1[0], step1[7]);
296 step2[1] = vaddq_s16(step1[1], step1[6]);
297 step2[2] = vaddq_s16(step1[2], step1[5]);
298 step2[3] = vaddq_s16(step1[3], step1[4]);
299 step2[4] = vsubq_s16(step1[3], step1[4]);
300 step2[5] = vsubq_s16(step1[2], step1[5]);
301 step2[6] = vsubq_s16(step1[1], step1[6]);
302 step2[7] = vsubq_s16(step1[0], step1[7]);
303 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
305 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
309 step2[14] = step1[14];
310 step2[15] = step1[15];
313 idct16x16_add_stage7(step2, out);
316 idct16x16_store_pass1(out, output);
319 idct16x16_add_store_bd8(out, dest, stride);
321 idct16x16_add_store(out, dest, stride);
326 void idct16x16_38_add_half1d(const void *const input, int16_t *const output,
327 void *const dest, const int stride,
328 const int highbd_flag) {
329 const int16x8_t cospis0 = vld1q_s16(kCospi);
330 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
331 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
332 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
333 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
334 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
335 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
336 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
337 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
338 int16x8_t in[8], step1[16], step2[16], out[16];
342 const tran_low_t *inputT = (const tran_low_t *)input;
343 in[0] = load_tran_low_to_s16q(inputT);
345 in[1] = load_tran_low_to_s16q(inputT);
347 in[2] = load_tran_low_to_s16q(inputT);
349 in[3] = load_tran_low_to_s16q(inputT);
351 in[4] = load_tran_low_to_s16q(inputT);
353 in[5] = load_tran_low_to_s16q(inputT);
355 in[6] = load_tran_low_to_s16q(inputT);
357 in[7] = load_tran_low_to_s16q(inputT);
359 const int16_t *inputT = (const int16_t *)input;
360 in[0] = vld1q_s16(inputT);
362 in[1] = vld1q_s16(inputT);
364 in[2] = vld1q_s16(inputT);
366 in[3] = vld1q_s16(inputT);
368 in[4] = vld1q_s16(inputT);
370 in[5] = vld1q_s16(inputT);
372 in[6] = vld1q_s16(inputT);
374 in[7] = vld1q_s16(inputT);
378 transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
382 step1[0] = in[0 / 2];
383 step1[2] = in[8 / 2];
384 step1[4] = in[4 / 2];
385 step1[6] = in[12 / 2];
386 step1[8] = in[2 / 2];
387 step1[10] = in[10 / 2];
388 step1[12] = in[6 / 2];
389 step1[14] = in[14 / 2]; // 0 in pass 1
396 step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
397 step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
398 step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
399 step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
400 step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
401 step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
402 step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
403 step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
408 step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
409 step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
410 step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
411 step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
412 step1[8] = vaddq_s16(step2[8], step2[9]);
413 step1[9] = vsubq_s16(step2[8], step2[9]);
414 step1[10] = vsubq_s16(step2[11], step2[10]);
415 step1[11] = vaddq_s16(step2[11], step2[10]);
416 step1[12] = vaddq_s16(step2[12], step2[13]);
417 step1[13] = vsubq_s16(step2[12], step2[13]);
418 step1[14] = vsubq_s16(step2[15], step2[14]);
419 step1[15] = vaddq_s16(step2[15], step2[14]);
422 step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
423 step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
424 step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
425 step2[4] = vaddq_s16(step1[4], step1[5]);
426 step2[5] = vsubq_s16(step1[4], step1[5]);
427 step2[6] = vsubq_s16(step1[7], step1[6]);
428 step2[7] = vaddq_s16(step1[7], step1[6]);
430 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
432 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
434 step2[11] = step1[11];
435 step2[12] = step1[12];
436 step2[15] = step1[15];
439 step1[0] = vaddq_s16(step2[0], step2[3]);
440 step1[1] = vaddq_s16(step2[1], step2[2]);
441 step1[2] = vsubq_s16(step2[1], step2[2]);
442 step1[3] = vsubq_s16(step2[0], step2[3]);
444 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
446 step1[8] = vaddq_s16(step2[8], step2[11]);
447 step1[9] = vaddq_s16(step2[9], step2[10]);
448 step1[10] = vsubq_s16(step2[9], step2[10]);
449 step1[11] = vsubq_s16(step2[8], step2[11]);
450 step1[12] = vsubq_s16(step2[15], step2[12]);
451 step1[13] = vsubq_s16(step2[14], step2[13]);
452 step1[14] = vaddq_s16(step2[14], step2[13]);
453 step1[15] = vaddq_s16(step2[15], step2[12]);
456 step2[0] = vaddq_s16(step1[0], step1[7]);
457 step2[1] = vaddq_s16(step1[1], step1[6]);
458 step2[2] = vaddq_s16(step1[2], step1[5]);
459 step2[3] = vaddq_s16(step1[3], step1[4]);
460 step2[4] = vsubq_s16(step1[3], step1[4]);
461 step2[5] = vsubq_s16(step1[2], step1[5]);
462 step2[6] = vsubq_s16(step1[1], step1[6]);
463 step2[7] = vsubq_s16(step1[0], step1[7]);
464 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
466 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
470 step2[14] = step1[14];
471 step2[15] = step1[15];
474 idct16x16_add_stage7(step2, out);
477 idct16x16_store_pass1(out, output);
480 idct16x16_add_store_bd8(out, dest, stride);
482 idct16x16_add_store(out, dest, stride);
487 void idct16x16_10_add_half1d_pass1(const tran_low_t *input, int16_t *output) {
488 const int16x8_t cospis0 = vld1q_s16(kCospi);
489 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
490 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
491 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
492 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
493 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
494 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
495 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
496 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
497 int16x4_t in[4], step1[16], step2[16], out[16];
500 in[0] = load_tran_low_to_s16d(input);
502 in[1] = load_tran_low_to_s16d(input);
504 in[2] = load_tran_low_to_s16d(input);
506 in[3] = load_tran_low_to_s16d(input);
509 transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
512 step1[0] = in[0 / 2];
513 step1[4] = in[4 / 2];
514 step1[8] = in[2 / 2];
515 step1[12] = in[6 / 2];
520 step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
521 step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
522 step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
523 step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
527 step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
528 step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
531 step1[10] = step2[11];
532 step1[11] = step2[11];
533 step1[12] = step2[12];
534 step1[13] = step2[12];
535 step1[14] = step2[15];
536 step1[15] = step2[15];
539 step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
545 idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
547 idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
549 step2[11] = step1[11];
550 step2[12] = step1[12];
551 step2[15] = step1[15];
559 idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
561 step1[8] = vadd_s16(step2[8], step2[11]);
562 step1[9] = vadd_s16(step2[9], step2[10]);
563 step1[10] = vsub_s16(step2[9], step2[10]);
564 step1[11] = vsub_s16(step2[8], step2[11]);
565 step1[12] = vsub_s16(step2[15], step2[12]);
566 step1[13] = vsub_s16(step2[14], step2[13]);
567 step1[14] = vadd_s16(step2[14], step2[13]);
568 step1[15] = vadd_s16(step2[15], step2[12]);
571 step2[0] = vadd_s16(step1[0], step1[7]);
572 step2[1] = vadd_s16(step1[1], step1[6]);
573 step2[2] = vadd_s16(step1[2], step1[5]);
574 step2[3] = vadd_s16(step1[3], step1[4]);
575 step2[4] = vsub_s16(step1[3], step1[4]);
576 step2[5] = vsub_s16(step1[2], step1[5]);
577 step2[6] = vsub_s16(step1[1], step1[6]);
578 step2[7] = vsub_s16(step1[0], step1[7]);
579 idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
581 idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
585 step2[14] = step1[14];
586 step2[15] = step1[15];
589 out[0] = vadd_s16(step2[0], step2[15]);
590 out[1] = vadd_s16(step2[1], step2[14]);
591 out[2] = vadd_s16(step2[2], step2[13]);
592 out[3] = vadd_s16(step2[3], step2[12]);
593 out[4] = vadd_s16(step2[4], step2[11]);
594 out[5] = vadd_s16(step2[5], step2[10]);
595 out[6] = vadd_s16(step2[6], step2[9]);
596 out[7] = vadd_s16(step2[7], step2[8]);
597 out[8] = vsub_s16(step2[7], step2[8]);
598 out[9] = vsub_s16(step2[6], step2[9]);
599 out[10] = vsub_s16(step2[5], step2[10]);
600 out[11] = vsub_s16(step2[4], step2[11]);
601 out[12] = vsub_s16(step2[3], step2[12]);
602 out[13] = vsub_s16(step2[2], step2[13]);
603 out[14] = vsub_s16(step2[1], step2[14]);
604 out[15] = vsub_s16(step2[0], step2[15]);
606 // pass 1: save the result into output
607 vst1_s16(output, out[0]);
609 vst1_s16(output, out[1]);
611 vst1_s16(output, out[2]);
613 vst1_s16(output, out[3]);
615 vst1_s16(output, out[4]);
617 vst1_s16(output, out[5]);
619 vst1_s16(output, out[6]);
621 vst1_s16(output, out[7]);
623 vst1_s16(output, out[8]);
625 vst1_s16(output, out[9]);
627 vst1_s16(output, out[10]);
629 vst1_s16(output, out[11]);
631 vst1_s16(output, out[12]);
633 vst1_s16(output, out[13]);
635 vst1_s16(output, out[14]);
637 vst1_s16(output, out[15]);
640 void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *const output,
641 void *const dest, const int stride,
642 const int highbd_flag) {
643 const int16x8_t cospis0 = vld1q_s16(kCospi);
644 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
645 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
646 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
647 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
648 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
649 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
650 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
651 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
653 int16x8_t in[4], step1[16], step2[16], out[16];
656 ind[0] = vld1_s16(input);
658 ind[1] = vld1_s16(input);
660 ind[2] = vld1_s16(input);
662 ind[3] = vld1_s16(input);
664 ind[4] = vld1_s16(input);
666 ind[5] = vld1_s16(input);
668 ind[6] = vld1_s16(input);
670 ind[7] = vld1_s16(input);
673 transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
674 ind[7], &in[0], &in[1], &in[2], &in[3]);
677 step1[0] = in[0 / 2];
678 step1[4] = in[4 / 2];
679 step1[8] = in[2 / 2];
680 step1[12] = in[6 / 2];
685 step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
686 step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
687 step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
688 step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
692 step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
693 step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
696 step1[10] = step2[11];
697 step1[11] = step2[11];
698 step1[12] = step2[12];
699 step1[13] = step2[12];
700 step1[14] = step2[15];
701 step1[15] = step2[15];
704 step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
710 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
712 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
714 step2[11] = step1[11];
715 step2[12] = step1[12];
716 step2[15] = step1[15];
724 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
726 step1[8] = vaddq_s16(step2[8], step2[11]);
727 step1[9] = vaddq_s16(step2[9], step2[10]);
728 step1[10] = vsubq_s16(step2[9], step2[10]);
729 step1[11] = vsubq_s16(step2[8], step2[11]);
730 step1[12] = vsubq_s16(step2[15], step2[12]);
731 step1[13] = vsubq_s16(step2[14], step2[13]);
732 step1[14] = vaddq_s16(step2[14], step2[13]);
733 step1[15] = vaddq_s16(step2[15], step2[12]);
736 step2[0] = vaddq_s16(step1[0], step1[7]);
737 step2[1] = vaddq_s16(step1[1], step1[6]);
738 step2[2] = vaddq_s16(step1[2], step1[5]);
739 step2[3] = vaddq_s16(step1[3], step1[4]);
740 step2[4] = vsubq_s16(step1[3], step1[4]);
741 step2[5] = vsubq_s16(step1[2], step1[5]);
742 step2[6] = vsubq_s16(step1[1], step1[6]);
743 step2[7] = vsubq_s16(step1[0], step1[7]);
744 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
746 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
750 step2[14] = step1[14];
751 step2[15] = step1[15];
754 idct16x16_add_stage7(step2, out);
757 idct16x16_store_pass1(out, output);
760 idct16x16_add_store_bd8(out, dest, stride);
762 idct16x16_add_store(out, dest, stride);
767 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
769 int16_t row_idct_output[16 * 16];
772 // Parallel idct on the upper 8 rows
773 idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
775 // Parallel idct on the lower 8 rows
776 idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride,
780 // Parallel idct to get the left 8 columns
781 idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
783 // Parallel idct to get the right 8 columns
784 idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0);
787 void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
789 int16_t row_idct_output[16 * 16];
792 // Parallel idct on the upper 8 rows
793 idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
796 // Parallel idct to get the left 8 columns
797 idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
799 // Parallel idct to get the right 8 columns
800 idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0);
803 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
805 int16_t row_idct_output[4 * 16];
808 // Parallel idct on the upper 8 rows
809 idct16x16_10_add_half1d_pass1(input, row_idct_output);
812 // Parallel idct to get the left 8 columns
813 idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
815 // Parallel idct to get the right 8 columns
816 idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, stride,