/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/idct_neon.h"
15 #include "vpx_dsp/txfm_common.h"
17 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
18 int16x4_t *const d1) {
19 *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
20 *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
23 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
25 const int16x4_t cospi_0_8_16_24,
26 int32x4_t *const t32) {
27 t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
28 t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
29 t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
30 t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
33 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
34 const int16x4_t cospi_0_8_16_24,
35 int16x4_t *const d0, int16x4_t *const d1) {
38 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
39 wrap_low_4x2(t32, d0, d1);
42 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
43 const int16x4_t cospi_0_8_16_24,
45 int16x4_t *const d1) {
48 idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
49 t32[1] = vnegq_s32(t32[1]);
50 wrap_low_4x2(t32, d0, d1);
53 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
54 const int16x4_t cospi_0_8_16_24,
56 int16x4_t *const d1) {
59 t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
60 t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
61 t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
62 wrap_low_4x2(t32, d0, d1);
65 static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
67 // Save the result into output
68 vst1q_s16(output, out[0]);
70 vst1q_s16(output, out[1]);
72 vst1q_s16(output, out[2]);
74 vst1q_s16(output, out[3]);
76 vst1q_s16(output, out[4]);
78 vst1q_s16(output, out[5]);
80 vst1q_s16(output, out[6]);
82 vst1q_s16(output, out[7]);
84 vst1q_s16(output, out[8]);
86 vst1q_s16(output, out[9]);
88 vst1q_s16(output, out[10]);
90 vst1q_s16(output, out[11]);
92 vst1q_s16(output, out[12]);
94 vst1q_s16(output, out[13]);
96 vst1q_s16(output, out[14]);
98 vst1q_s16(output, out[15]);
101 static INLINE void idct16x16_add_store(const int16x8_t *const out,
102 uint8_t *dest, const int stride) {
103 // Add the result to dest
104 idct16x16_add8x1(out[0], &dest, stride);
105 idct16x16_add8x1(out[1], &dest, stride);
106 idct16x16_add8x1(out[2], &dest, stride);
107 idct16x16_add8x1(out[3], &dest, stride);
108 idct16x16_add8x1(out[4], &dest, stride);
109 idct16x16_add8x1(out[5], &dest, stride);
110 idct16x16_add8x1(out[6], &dest, stride);
111 idct16x16_add8x1(out[7], &dest, stride);
112 idct16x16_add8x1(out[8], &dest, stride);
113 idct16x16_add8x1(out[9], &dest, stride);
114 idct16x16_add8x1(out[10], &dest, stride);
115 idct16x16_add8x1(out[11], &dest, stride);
116 idct16x16_add8x1(out[12], &dest, stride);
117 idct16x16_add8x1(out[13], &dest, stride);
118 idct16x16_add8x1(out[14], &dest, stride);
119 idct16x16_add8x1(out[15], &dest, stride);
122 void idct16x16_256_add_half1d(const void *const input, int16_t *output,
123 void *const dest, const int stride,
124 const int highbd_flag) {
125 const int16x8_t cospis0 = vld1q_s16(kCospi);
126 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
127 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
128 const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
129 const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
130 const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1);
131 int16x8_t in[16], step1[16], step2[16], out[16];
135 const tran_low_t *inputT = (const tran_low_t *)input;
136 in[0] = load_tran_low_to_s16q(inputT);
138 in[8] = load_tran_low_to_s16q(inputT);
140 in[1] = load_tran_low_to_s16q(inputT);
142 in[9] = load_tran_low_to_s16q(inputT);
144 in[2] = load_tran_low_to_s16q(inputT);
146 in[10] = load_tran_low_to_s16q(inputT);
148 in[3] = load_tran_low_to_s16q(inputT);
150 in[11] = load_tran_low_to_s16q(inputT);
152 in[4] = load_tran_low_to_s16q(inputT);
154 in[12] = load_tran_low_to_s16q(inputT);
156 in[5] = load_tran_low_to_s16q(inputT);
158 in[13] = load_tran_low_to_s16q(inputT);
160 in[6] = load_tran_low_to_s16q(inputT);
162 in[14] = load_tran_low_to_s16q(inputT);
164 in[7] = load_tran_low_to_s16q(inputT);
166 in[15] = load_tran_low_to_s16q(inputT);
168 const int16_t *inputT = (const int16_t *)input;
169 in[0] = vld1q_s16(inputT);
171 in[8] = vld1q_s16(inputT);
173 in[1] = vld1q_s16(inputT);
175 in[9] = vld1q_s16(inputT);
177 in[2] = vld1q_s16(inputT);
179 in[10] = vld1q_s16(inputT);
181 in[3] = vld1q_s16(inputT);
183 in[11] = vld1q_s16(inputT);
185 in[4] = vld1q_s16(inputT);
187 in[12] = vld1q_s16(inputT);
189 in[5] = vld1q_s16(inputT);
191 in[13] = vld1q_s16(inputT);
193 in[6] = vld1q_s16(inputT);
195 in[14] = vld1q_s16(inputT);
197 in[7] = vld1q_s16(inputT);
199 in[15] = vld1q_s16(inputT);
203 transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
205 transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
209 step1[0] = in[0 / 2];
210 step1[1] = in[16 / 2];
211 step1[2] = in[8 / 2];
212 step1[3] = in[24 / 2];
213 step1[4] = in[4 / 2];
214 step1[5] = in[20 / 2];
215 step1[6] = in[12 / 2];
216 step1[7] = in[28 / 2];
217 step1[8] = in[2 / 2];
218 step1[9] = in[18 / 2];
219 step1[10] = in[10 / 2];
220 step1[11] = in[26 / 2];
221 step1[12] = in[6 / 2];
222 step1[13] = in[22 / 2];
223 step1[14] = in[14 / 2];
224 step1[15] = in[30 / 2];
235 idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
236 idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
238 idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
240 idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
248 idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
249 idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
250 step1[8] = vaddq_s16(step2[8], step2[9]);
251 step1[9] = vsubq_s16(step2[8], step2[9]);
252 step1[10] = vsubq_s16(step2[11], step2[10]);
253 step1[11] = vaddq_s16(step2[11], step2[10]);
254 step1[12] = vaddq_s16(step2[12], step2[13]);
255 step1[13] = vsubq_s16(step2[12], step2[13]);
256 step1[14] = vsubq_s16(step2[15], step2[14]);
257 step1[15] = vaddq_s16(step2[15], step2[14]);
260 idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
261 idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
262 step2[4] = vaddq_s16(step1[4], step1[5]);
263 step2[5] = vsubq_s16(step1[4], step1[5]);
264 step2[6] = vsubq_s16(step1[7], step1[6]);
265 step2[7] = vaddq_s16(step1[7], step1[6]);
267 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
269 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
271 step2[11] = step1[11];
272 step2[12] = step1[12];
273 step2[15] = step1[15];
276 step1[0] = vaddq_s16(step2[0], step2[3]);
277 step1[1] = vaddq_s16(step2[1], step2[2]);
278 step1[2] = vsubq_s16(step2[1], step2[2]);
279 step1[3] = vsubq_s16(step2[0], step2[3]);
281 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
283 step1[8] = vaddq_s16(step2[8], step2[11]);
284 step1[9] = vaddq_s16(step2[9], step2[10]);
285 step1[10] = vsubq_s16(step2[9], step2[10]);
286 step1[11] = vsubq_s16(step2[8], step2[11]);
287 step1[12] = vsubq_s16(step2[15], step2[12]);
288 step1[13] = vsubq_s16(step2[14], step2[13]);
289 step1[14] = vaddq_s16(step2[14], step2[13]);
290 step1[15] = vaddq_s16(step2[15], step2[12]);
293 step2[0] = vaddq_s16(step1[0], step1[7]);
294 step2[1] = vaddq_s16(step1[1], step1[6]);
295 step2[2] = vaddq_s16(step1[2], step1[5]);
296 step2[3] = vaddq_s16(step1[3], step1[4]);
297 step2[4] = vsubq_s16(step1[3], step1[4]);
298 step2[5] = vsubq_s16(step1[2], step1[5]);
299 step2[6] = vsubq_s16(step1[1], step1[6]);
300 step2[7] = vsubq_s16(step1[0], step1[7]);
301 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
303 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
307 step2[14] = step1[14];
308 step2[15] = step1[15];
311 idct16x16_add_stage7(step2, out);
314 idct16x16_store_pass1(out, output);
317 // pass 2: add the result to dest.
318 const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
319 uint16_t *destT = dest;
320 out[0] = vrshrq_n_s16(out[0], 6);
321 out[1] = vrshrq_n_s16(out[1], 6);
322 out[2] = vrshrq_n_s16(out[2], 6);
323 out[3] = vrshrq_n_s16(out[3], 6);
324 out[4] = vrshrq_n_s16(out[4], 6);
325 out[5] = vrshrq_n_s16(out[5], 6);
326 out[6] = vrshrq_n_s16(out[6], 6);
327 out[7] = vrshrq_n_s16(out[7], 6);
328 out[8] = vrshrq_n_s16(out[8], 6);
329 out[9] = vrshrq_n_s16(out[9], 6);
330 out[10] = vrshrq_n_s16(out[10], 6);
331 out[11] = vrshrq_n_s16(out[11], 6);
332 out[12] = vrshrq_n_s16(out[12], 6);
333 out[13] = vrshrq_n_s16(out[13], 6);
334 out[14] = vrshrq_n_s16(out[14], 6);
335 out[15] = vrshrq_n_s16(out[15], 6);
336 highbd_idct16x16_add8x1(out[0], max, &destT, stride);
337 highbd_idct16x16_add8x1(out[1], max, &destT, stride);
338 highbd_idct16x16_add8x1(out[2], max, &destT, stride);
339 highbd_idct16x16_add8x1(out[3], max, &destT, stride);
340 highbd_idct16x16_add8x1(out[4], max, &destT, stride);
341 highbd_idct16x16_add8x1(out[5], max, &destT, stride);
342 highbd_idct16x16_add8x1(out[6], max, &destT, stride);
343 highbd_idct16x16_add8x1(out[7], max, &destT, stride);
344 highbd_idct16x16_add8x1(out[8], max, &destT, stride);
345 highbd_idct16x16_add8x1(out[9], max, &destT, stride);
346 highbd_idct16x16_add8x1(out[10], max, &destT, stride);
347 highbd_idct16x16_add8x1(out[11], max, &destT, stride);
348 highbd_idct16x16_add8x1(out[12], max, &destT, stride);
349 highbd_idct16x16_add8x1(out[13], max, &destT, stride);
350 highbd_idct16x16_add8x1(out[14], max, &destT, stride);
351 highbd_idct16x16_add8x1(out[15], max, &destT, stride);
353 idct16x16_add_store(out, dest, stride);
358 static void idct16x16_38_add_half1d(const void *const input, int16_t *output,
359 uint8_t *dest, const int stride) {
360 const int16x8_t cospis0 = vld1q_s16(kCospi);
361 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
362 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
363 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
364 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
365 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
366 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
367 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
368 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
369 int16x8_t in[8], step1[16], step2[16], out[16];
373 const tran_low_t *inputT = (const tran_low_t *)input;
374 in[0] = load_tran_low_to_s16q(inputT);
376 in[1] = load_tran_low_to_s16q(inputT);
378 in[2] = load_tran_low_to_s16q(inputT);
380 in[3] = load_tran_low_to_s16q(inputT);
382 in[4] = load_tran_low_to_s16q(inputT);
384 in[5] = load_tran_low_to_s16q(inputT);
386 in[6] = load_tran_low_to_s16q(inputT);
388 in[7] = load_tran_low_to_s16q(inputT);
390 const int16_t *inputT = (const int16_t *)input;
391 in[0] = vld1q_s16(inputT);
393 in[1] = vld1q_s16(inputT);
395 in[2] = vld1q_s16(inputT);
397 in[3] = vld1q_s16(inputT);
399 in[4] = vld1q_s16(inputT);
401 in[5] = vld1q_s16(inputT);
403 in[6] = vld1q_s16(inputT);
405 in[7] = vld1q_s16(inputT);
409 transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
413 step1[0] = in[0 / 2];
414 step1[2] = in[8 / 2];
415 step1[4] = in[4 / 2];
416 step1[6] = in[12 / 2];
417 step1[8] = in[2 / 2];
418 step1[10] = in[10 / 2];
419 step1[12] = in[6 / 2];
420 step1[14] = in[14 / 2]; // 0 in pass 1
427 step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
428 step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
429 step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
430 step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
431 step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
432 step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
433 step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
434 step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
439 step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
440 step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
441 step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
442 step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
443 step1[8] = vaddq_s16(step2[8], step2[9]);
444 step1[9] = vsubq_s16(step2[8], step2[9]);
445 step1[10] = vsubq_s16(step2[11], step2[10]);
446 step1[11] = vaddq_s16(step2[11], step2[10]);
447 step1[12] = vaddq_s16(step2[12], step2[13]);
448 step1[13] = vsubq_s16(step2[12], step2[13]);
449 step1[14] = vsubq_s16(step2[15], step2[14]);
450 step1[15] = vaddq_s16(step2[15], step2[14]);
453 step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
454 step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
455 step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
456 step2[4] = vaddq_s16(step1[4], step1[5]);
457 step2[5] = vsubq_s16(step1[4], step1[5]);
458 step2[6] = vsubq_s16(step1[7], step1[6]);
459 step2[7] = vaddq_s16(step1[7], step1[6]);
461 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
463 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
465 step2[11] = step1[11];
466 step2[12] = step1[12];
467 step2[15] = step1[15];
470 step1[0] = vaddq_s16(step2[0], step2[3]);
471 step1[1] = vaddq_s16(step2[1], step2[2]);
472 step1[2] = vsubq_s16(step2[1], step2[2]);
473 step1[3] = vsubq_s16(step2[0], step2[3]);
475 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
477 step1[8] = vaddq_s16(step2[8], step2[11]);
478 step1[9] = vaddq_s16(step2[9], step2[10]);
479 step1[10] = vsubq_s16(step2[9], step2[10]);
480 step1[11] = vsubq_s16(step2[8], step2[11]);
481 step1[12] = vsubq_s16(step2[15], step2[12]);
482 step1[13] = vsubq_s16(step2[14], step2[13]);
483 step1[14] = vaddq_s16(step2[14], step2[13]);
484 step1[15] = vaddq_s16(step2[15], step2[12]);
487 step2[0] = vaddq_s16(step1[0], step1[7]);
488 step2[1] = vaddq_s16(step1[1], step1[6]);
489 step2[2] = vaddq_s16(step1[2], step1[5]);
490 step2[3] = vaddq_s16(step1[3], step1[4]);
491 step2[4] = vsubq_s16(step1[3], step1[4]);
492 step2[5] = vsubq_s16(step1[2], step1[5]);
493 step2[6] = vsubq_s16(step1[1], step1[6]);
494 step2[7] = vsubq_s16(step1[0], step1[7]);
495 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
497 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
501 step2[14] = step1[14];
502 step2[15] = step1[15];
505 idct16x16_add_stage7(step2, out);
508 idct16x16_store_pass1(out, output);
510 idct16x16_add_store(out, dest, stride);
514 static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
516 const int16x8_t cospis0 = vld1q_s16(kCospi);
517 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
518 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
519 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
520 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
521 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
522 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
523 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
524 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
525 int16x4_t in[4], step1[16], step2[16], out[16];
528 in[0] = load_tran_low_to_s16d(input);
530 in[1] = load_tran_low_to_s16d(input);
532 in[2] = load_tran_low_to_s16d(input);
534 in[3] = load_tran_low_to_s16d(input);
537 transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
540 step1[0] = in[0 / 2];
541 step1[4] = in[4 / 2];
542 step1[8] = in[2 / 2];
543 step1[12] = in[6 / 2];
548 step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
549 step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
550 step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
551 step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
555 step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
556 step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
559 step1[10] = step2[11];
560 step1[11] = step2[11];
561 step1[12] = step2[12];
562 step1[13] = step2[12];
563 step1[14] = step2[15];
564 step1[15] = step2[15];
567 step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
573 idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
575 idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
577 step2[11] = step1[11];
578 step2[12] = step1[12];
579 step2[15] = step1[15];
587 idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
589 step1[8] = vadd_s16(step2[8], step2[11]);
590 step1[9] = vadd_s16(step2[9], step2[10]);
591 step1[10] = vsub_s16(step2[9], step2[10]);
592 step1[11] = vsub_s16(step2[8], step2[11]);
593 step1[12] = vsub_s16(step2[15], step2[12]);
594 step1[13] = vsub_s16(step2[14], step2[13]);
595 step1[14] = vadd_s16(step2[14], step2[13]);
596 step1[15] = vadd_s16(step2[15], step2[12]);
599 step2[0] = vadd_s16(step1[0], step1[7]);
600 step2[1] = vadd_s16(step1[1], step1[6]);
601 step2[2] = vadd_s16(step1[2], step1[5]);
602 step2[3] = vadd_s16(step1[3], step1[4]);
603 step2[4] = vsub_s16(step1[3], step1[4]);
604 step2[5] = vsub_s16(step1[2], step1[5]);
605 step2[6] = vsub_s16(step1[1], step1[6]);
606 step2[7] = vsub_s16(step1[0], step1[7]);
607 idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
609 idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
613 step2[14] = step1[14];
614 step2[15] = step1[15];
617 out[0] = vadd_s16(step2[0], step2[15]);
618 out[1] = vadd_s16(step2[1], step2[14]);
619 out[2] = vadd_s16(step2[2], step2[13]);
620 out[3] = vadd_s16(step2[3], step2[12]);
621 out[4] = vadd_s16(step2[4], step2[11]);
622 out[5] = vadd_s16(step2[5], step2[10]);
623 out[6] = vadd_s16(step2[6], step2[9]);
624 out[7] = vadd_s16(step2[7], step2[8]);
625 out[8] = vsub_s16(step2[7], step2[8]);
626 out[9] = vsub_s16(step2[6], step2[9]);
627 out[10] = vsub_s16(step2[5], step2[10]);
628 out[11] = vsub_s16(step2[4], step2[11]);
629 out[12] = vsub_s16(step2[3], step2[12]);
630 out[13] = vsub_s16(step2[2], step2[13]);
631 out[14] = vsub_s16(step2[1], step2[14]);
632 out[15] = vsub_s16(step2[0], step2[15]);
634 // pass 1: save the result into output
635 vst1_s16(output, out[0]);
637 vst1_s16(output, out[1]);
639 vst1_s16(output, out[2]);
641 vst1_s16(output, out[3]);
643 vst1_s16(output, out[4]);
645 vst1_s16(output, out[5]);
647 vst1_s16(output, out[6]);
649 vst1_s16(output, out[7]);
651 vst1_s16(output, out[8]);
653 vst1_s16(output, out[9]);
655 vst1_s16(output, out[10]);
657 vst1_s16(output, out[11]);
659 vst1_s16(output, out[12]);
661 vst1_s16(output, out[13]);
663 vst1_s16(output, out[14]);
665 vst1_s16(output, out[15]);
668 static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
669 uint8_t *dest, const int stride) {
670 const int16x8_t cospis0 = vld1q_s16(kCospi);
671 const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
672 const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
673 const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
674 const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
675 const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
676 const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
677 const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
678 const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
680 int16x8_t in[4], step1[16], step2[16], out[16];
683 ind[0] = vld1_s16(input);
685 ind[1] = vld1_s16(input);
687 ind[2] = vld1_s16(input);
689 ind[3] = vld1_s16(input);
691 ind[4] = vld1_s16(input);
693 ind[5] = vld1_s16(input);
695 ind[6] = vld1_s16(input);
697 ind[7] = vld1_s16(input);
700 transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
701 ind[7], &in[0], &in[1], &in[2], &in[3]);
704 step1[0] = in[0 / 2];
705 step1[4] = in[4 / 2];
706 step1[8] = in[2 / 2];
707 step1[12] = in[6 / 2];
712 step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
713 step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
714 step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
715 step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
719 step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
720 step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
723 step1[10] = step2[11];
724 step1[11] = step2[11];
725 step1[12] = step2[12];
726 step1[13] = step2[12];
727 step1[14] = step2[15];
728 step1[15] = step2[15];
731 step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
737 idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
739 idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
741 step2[11] = step1[11];
742 step2[12] = step1[12];
743 step2[15] = step1[15];
751 idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
753 step1[8] = vaddq_s16(step2[8], step2[11]);
754 step1[9] = vaddq_s16(step2[9], step2[10]);
755 step1[10] = vsubq_s16(step2[9], step2[10]);
756 step1[11] = vsubq_s16(step2[8], step2[11]);
757 step1[12] = vsubq_s16(step2[15], step2[12]);
758 step1[13] = vsubq_s16(step2[14], step2[13]);
759 step1[14] = vaddq_s16(step2[14], step2[13]);
760 step1[15] = vaddq_s16(step2[15], step2[12]);
763 step2[0] = vaddq_s16(step1[0], step1[7]);
764 step2[1] = vaddq_s16(step1[1], step1[6]);
765 step2[2] = vaddq_s16(step1[2], step1[5]);
766 step2[3] = vaddq_s16(step1[3], step1[4]);
767 step2[4] = vsubq_s16(step1[3], step1[4]);
768 step2[5] = vsubq_s16(step1[2], step1[5]);
769 step2[6] = vsubq_s16(step1[1], step1[6]);
770 step2[7] = vsubq_s16(step1[0], step1[7]);
771 idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
773 idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
777 step2[14] = step1[14];
778 step2[15] = step1[15];
781 out[0] = vaddq_s16(step2[0], step2[15]);
782 out[1] = vaddq_s16(step2[1], step2[14]);
783 out[2] = vaddq_s16(step2[2], step2[13]);
784 out[3] = vaddq_s16(step2[3], step2[12]);
785 out[4] = vaddq_s16(step2[4], step2[11]);
786 out[5] = vaddq_s16(step2[5], step2[10]);
787 out[6] = vaddq_s16(step2[6], step2[9]);
788 out[7] = vaddq_s16(step2[7], step2[8]);
789 out[8] = vsubq_s16(step2[7], step2[8]);
790 out[9] = vsubq_s16(step2[6], step2[9]);
791 out[10] = vsubq_s16(step2[5], step2[10]);
792 out[11] = vsubq_s16(step2[4], step2[11]);
793 out[12] = vsubq_s16(step2[3], step2[12]);
794 out[13] = vsubq_s16(step2[2], step2[13]);
795 out[14] = vsubq_s16(step2[1], step2[14]);
796 out[15] = vsubq_s16(step2[0], step2[15]);
799 idct16x16_store_pass1(out, output);
801 idct16x16_add_store(out, dest, stride);
805 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
807 int16_t row_idct_output[16 * 16];
810 // Parallel idct on the upper 8 rows
811 idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
813 // Parallel idct on the lower 8 rows
814 idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride,
818 // Parallel idct to get the left 8 columns
819 idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
821 // Parallel idct to get the right 8 columns
822 idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0);
825 void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
827 int16_t row_idct_output[16 * 16];
830 // Parallel idct on the upper 8 rows
831 idct16x16_38_add_half1d(input, row_idct_output, dest, stride);
834 // Parallel idct to get the left 8 columns
835 idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride);
837 // Parallel idct to get the right 8 columns
838 idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
841 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
843 int16_t row_idct_output[4 * 16];
846 // Parallel idct on the upper 8 rows
847 idct16x16_10_add_half1d_pass1(input, row_idct_output);
850 // Parallel idct to get the left 8 columns
851 idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
853 // Parallel idct to get the right 8 columns
854 idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,