/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "vpx_dsp/arm/idct_neon.h"
14 #include "vpx_dsp/txfm_common.h"
/*
 * First pass of the 16x16 inverse DCT (NEON intrinsics).  The callers load
 * eight input rows into s0..s7; this helper runs them through an 8-point
 * IDCT butterfly network and stores eight rows of 16-bit intermediate
 * results through |out| for the second pass to combine.
 *
 * NOTE(review): this extract is missing interleaved lines from the
 * original file — the trailing "int16_t *out) {" of the parameter list,
 * the s0..s7 -> q8..q15 copies ahead of the transpose, the final
 * "&q15s16);" argument of the transpose call, and the "out += 8;"
 * advances between the stores.  The visible code is kept verbatim;
 * reconcile with the complete source before modifying.
 */
static void idct16x16_256_add_neon_pass1(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  // Transpose the 8x8 block so each q register holds one transposed row.
  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,

  // Split each 8-lane register into low/high 4-lane halves for the
  // widening (16x16 -> 32 bit) multiplies below.
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // Butterfly rotation of (q9, q15) by cospi_28/cospi_4 -> q4, q7.
  d0s16 = vdup_n_s16((int16_t)cospi_28_64);
  d1s16 = vdup_n_s16((int16_t)cospi_4_64);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d18s16, d1s16);
  q6s32 = vmull_s16(d19s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);

  d2s16 = vdup_n_s16((int16_t)cospi_12_64);
  d3s16 = vdup_n_s16((int16_t)cospi_20_64);

  // Round each 32-bit Q14 product back to 16 bits with saturation.
  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q5s32, 14);
  d15s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  // Butterfly rotation of (q13, q11) by cospi_12/cospi_20 -> q5, q6.
  q2s32 = vmull_s16(d26s16, d2s16);
  q3s32 = vmull_s16(d27s16, d2s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q15s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);

  d10s16 = vqrshrn_n_s32(q2s32, 14);
  d11s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q15s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Even half: sum/difference of q8 and q12 scaled by cospi_16.
  d30s16 = vdup_n_s16((int16_t)cospi_16_64);

  q2s32 = vmull_s16(d16s16, d30s16);
  q11s32 = vmull_s16(d17s16, d30s16);
  q0s32 = vmull_s16(d24s16, d30s16);
  q1s32 = vmull_s16(d25s16, d30s16);

  d30s16 = vdup_n_s16((int16_t)cospi_24_64);
  d31s16 = vdup_n_s16((int16_t)cospi_8_64);

  q3s32 = vaddq_s32(q2s32, q0s32);
  q12s32 = vaddq_s32(q11s32, q1s32);
  q13s32 = vsubq_s32(q2s32, q0s32);
  q1s32 = vsubq_s32(q11s32, q1s32);

  d16s16 = vqrshrn_n_s32(q3s32, 14);
  d17s16 = vqrshrn_n_s32(q12s32, 14);
  d18s16 = vqrshrn_n_s32(q13s32, 14);
  d19s16 = vqrshrn_n_s32(q1s32, 14);
  q8s16 = vcombine_s16(d16s16, d17s16);
  q9s16 = vcombine_s16(d18s16, d19s16);

  // Butterfly rotation of (q10, q14) by cospi_24/cospi_8 -> q10, q11.
  q0s32 = vmull_s16(d20s16, d31s16);
  q1s32 = vmull_s16(d21s16, d31s16);
  q12s32 = vmull_s16(d20s16, d30s16);
  q13s32 = vmull_s16(d21s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);

  d22s16 = vqrshrn_n_s32(q0s32, 14);
  d23s16 = vqrshrn_n_s32(q1s32, 14);
  d20s16 = vqrshrn_n_s32(q12s32, 14);
  d21s16 = vqrshrn_n_s32(q13s32, 14);
  q10s16 = vcombine_s16(d20s16, d21s16);
  q11s16 = vcombine_s16(d22s16, d23s16);

  // Odd-half add/sub butterflies on q4..q7.
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q15s16 = vaddq_s16(q6s16, q7s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  // Even-half add/sub butterflies on q8..q11.
  q0s16 = vaddq_s16(q8s16, q11s16);
  q1s16 = vaddq_s16(q9s16, q10s16);
  q2s16 = vsubq_s16(q9s16, q10s16);
  q3s16 = vsubq_s16(q8s16, q11s16);

  // Equalize q13/q14 via a cospi_16 rotation -> q5, q6.
  d16s16 = vdup_n_s16((int16_t)cospi_16_64);

  q11s32 = vmull_s16(d26s16, d16s16);
  q12s32 = vmull_s16(d27s16, d16s16);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);

  q6s32 = vsubq_s32(q9s32, q11s32);
  q13s32 = vsubq_s32(q10s32, q12s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d11s16 = vqrshrn_n_s32(q13s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Final combine of even and odd halves into the eight output rows.
  q8s16 = vaddq_s16(q0s16, q15s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q15s16);

  // Store the eight intermediate rows (the "out += 8;" advances between
  // the stores are elided in this extract).
  vst1q_s16(out, q8s16);
  vst1q_s16(out, q9s16);
  vst1q_s16(out, q10s16);
  vst1q_s16(out, q11s16);
  vst1q_s16(out, q12s16);
  vst1q_s16(out, q13s16);
  vst1q_s16(out, q14s16);
  vst1q_s16(out, q15s16);
/*
 * Public pass-1 entry point for plain int16_t coefficients: loads eight
 * rows from |in| into s0..s7, then delegates to the shared pass-1 helper.
 *
 * NOTE(review): the de-interleaving loads that fill s0..s7 from |in|
 * (and the closing brace) are elided in this extract — confirm against
 * the complete source.
 */
void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
233 #if CONFIG_VP9_HIGHBITDEPTH
/*
 * High-bit-depth pass-1 entry point: same as the int16_t variant but the
 * coefficients arrive as tran_low_t and are narrowed to int16 pairs via
 * load_tran_low_to_s16x2q() before the shared pass-1 helper runs.
 *
 * NOTE(review): this extract elides the trailing "int16_t *out) {" of
 * the signature, the declaration of |v|, the "sN = v.val[...]" extracts
 * and the "in += 16;" advances between loads — confirm against the
 * complete source.
 */
void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *in,
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Eight de-interleaving loads, one per input row pair.
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);
  v = load_tran_low_to_s16x2q(in);

  idct16x16_256_add_neon_pass1(s0, s1, s2, s3, s4, s5, s6, s7, out);
265 #endif // CONFIG_VP9_HIGHBITDEPTH
/*
 * Second pass of the 16x16 inverse DCT (NEON intrinsics).  Processes the
 * remaining eight input rows (s0..s7; presumably the odd-indexed rows —
 * confirm against the callers) and combines them with the pass-1
 * intermediate results read from |pass1_output|.  When |skip_adding| is
 * non-zero the final values are rounded (>> 6) and accumulated into the
 * |dest| pixel buffer; otherwise raw intermediate rows are written to
 * |out|.
 *
 * NOTE(review): this extract elides many interleaved source lines — the
 * trailing "int dest_stride) {" of the signature, the declaration of the
 * local pixel pointer |d| plus its "d = dest;" / stride updates, the
 * s0..s7 -> q8..q15 copies, the final "&q15s16);" transpose argument,
 * the "pass1_output += 8;" / "out += 4;" / "dest += dest_stride;"
 * advances, and the "qNNu16 =" assignment prefixes of several statements
 * that were wrapped onto two lines.  The visible code is kept verbatim;
 * reconcile with the complete source before modifying.
 */
static void idct16x16_256_add_neon_pass2(const int16x8_t s0, const int16x8_t s1,
                                         const int16x8_t s2, const int16x8_t s3,
                                         const int16x8_t s4, const int16x8_t s5,
                                         const int16x8_t s6, const int16x8_t s7,
                                         int16_t *out, int16_t *pass1_output,
                                         int16_t skip_adding, uint8_t *dest,
  uint8x8_t d12u8, d13u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  uint64x1_t d24u64, d25u64, d26u64, d27u64;
  int64x1_t d12s64, d13s64;
  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32;

  // Transpose the 8x8 block so each q register holds one transposed row.
  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,

  // Split into 4-lane halves for the widening multiplies.
  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // Butterfly rotation of (q8, q15) by cospi_30/cospi_2 -> q0, q7.
  d12s16 = vdup_n_s16((int16_t)cospi_30_64);
  d13s16 = vdup_n_s16((int16_t)cospi_2_64);

  q2s32 = vmull_s16(d16s16, d12s16);
  q3s32 = vmull_s16(d17s16, d12s16);
  q1s32 = vmull_s16(d16s16, d13s16);
  q4s32 = vmull_s16(d17s16, d13s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);

  d0s16 = vqrshrn_n_s32(q2s32, 14);
  d1s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q1s32, 14);
  d15s16 = vqrshrn_n_s32(q4s32, 14);
  q0s16 = vcombine_s16(d0s16, d1s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  // Butterfly rotation of (q12, q11) by cospi_14/cospi_18 -> q1, q6.
  d30s16 = vdup_n_s16((int16_t)cospi_14_64);
  d31s16 = vdup_n_s16((int16_t)cospi_18_64);

  q2s32 = vmull_s16(d24s16, d30s16);
  q3s32 = vmull_s16(d25s16, d30s16);
  q4s32 = vmull_s16(d24s16, d31s16);
  q5s32 = vmull_s16(d25s16, d31s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);

  d2s16 = vqrshrn_n_s32(q2s32, 14);
  d3s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q4s32, 14);
  d13s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Butterfly rotation of (q10, q13) by cospi_22/cospi_10 -> q2, q5.
  d30s16 = vdup_n_s16((int16_t)cospi_22_64);
  d31s16 = vdup_n_s16((int16_t)cospi_10_64);

  q11s32 = vmull_s16(d20s16, d30s16);
  q12s32 = vmull_s16(d21s16, d30s16);
  q4s32 = vmull_s16(d20s16, d31s16);
  q5s32 = vmull_s16(d21s16, d31s16);

  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d11s16 = vqrshrn_n_s32(q5s32, 14);
  d10s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // Butterfly rotation of (q14, q9) by cospi_6/cospi_26 -> q3, q4.
  d30s16 = vdup_n_s16((int16_t)cospi_6_64);
  d31s16 = vdup_n_s16((int16_t)cospi_26_64);

  q10s32 = vmull_s16(d28s16, d30s16);
  q11s32 = vmull_s16(d29s16, d30s16);
  q12s32 = vmull_s16(d28s16, d31s16);
  q13s32 = vmull_s16(d29s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q11s32, 14);
  d8s16 = vqrshrn_n_s32(q12s32, 14);
  d9s16 = vqrshrn_n_s32(q13s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // Add/sub butterflies pairing the four rotations above.
  q9s16 = vsubq_s16(q0s16, q1s16);
  q0s16 = vaddq_s16(q0s16, q1s16);
  q10s16 = vsubq_s16(q3s16, q2s16);
  q11s16 = vaddq_s16(q2s16, q3s16);
  q12s16 = vaddq_s16(q4s16, q5s16);
  q13s16 = vsubq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q6s16, q7s16);

  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  // Rotation of (q9, q14) by cospi_24/cospi_8 -> q1, q6.
  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
  d31s16 = vdup_n_s16((int16_t)cospi_24_64);

  q2s32 = vmull_s16(d18s16, d31s16);
  q3s32 = vmull_s16(d19s16, d31s16);
  q4s32 = vmull_s16(d28s16, d31s16);
  q5s32 = vmull_s16(d29s16, d31s16);

  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);

  d12s16 = vqrshrn_n_s32(q2s32, 14);
  d13s16 = vqrshrn_n_s32(q3s32, 14);
  d2s16 = vqrshrn_n_s32(q4s32, 14);
  d3s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Rotation of (q13, q10) using -cospi_8 -> q2, q5.
  d30s16 = vdup_n_s16(-cospi_8_64);
  q11s32 = vmull_s16(d26s16, d30s16);
  q12s32 = vmull_s16(d27s16, d30s16);
  q8s32 = vmull_s16(d20s16, d30s16);
  q9s32 = vmull_s16(d21s16, d30s16);

  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d10s16 = vqrshrn_n_s32(q8s32, 14);
  d11s16 = vqrshrn_n_s32(q9s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // Next add/sub butterfly stage.
  q8s16 = vaddq_s16(q0s16, q3s16);
  q9s16 = vaddq_s16(q1s16, q2s16);
  q10s16 = vsubq_s16(q1s16, q2s16);
  q11s16 = vsubq_s16(q0s16, q3s16);
  q12s16 = vsubq_s16(q7s16, q4s16);
  q13s16 = vsubq_s16(q6s16, q5s16);
  q14s16 = vaddq_s16(q6s16, q5s16);
  q15s16 = vaddq_s16(q7s16, q4s16);

  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);

  // Equalize (q13, q10) via cospi_16 rotation -> q2, q5.
  d14s16 = vdup_n_s16((int16_t)cospi_16_64);

  q3s32 = vmull_s16(d26s16, d14s16);
  q4s32 = vmull_s16(d27s16, d14s16);
  q0s32 = vmull_s16(d20s16, d14s16);
  q1s32 = vmull_s16(d21s16, d14s16);

  q5s32 = vsubq_s32(q3s32, q0s32);
  q6s32 = vsubq_s32(q4s32, q1s32);
  q10s32 = vaddq_s32(q3s32, q0s32);
  q4s32 = vaddq_s32(q4s32, q1s32);

  d4s16 = vqrshrn_n_s32(q5s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  d10s16 = vqrshrn_n_s32(q10s32, 14);
  d11s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // Equalize (q12, q11) via cospi_16 rotation -> q3, q4.
  q0s32 = vmull_s16(d22s16, d14s16);
  q1s32 = vmull_s16(d23s16, d14s16);
  q13s32 = vmull_s16(d24s16, d14s16);
  q6s32 = vmull_s16(d25s16, d14s16);

  q10s32 = vsubq_s32(q13s32, q0s32);
  q4s32 = vsubq_s32(q6s32, q1s32);
  q13s32 = vaddq_s32(q13s32, q0s32);
  q6s32 = vaddq_s32(q6s32, q1s32);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  d8s16 = vqrshrn_n_s32(q13s32, 14);
  d9s16 = vqrshrn_n_s32(q6s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  if (skip_adding != 0) {
    // Reconstruction path: combine with pass-1 rows, round by 6 bits,
    // add to the destination pixels and store through |d|.
    // load the data in pass1
    q0s16 = vld1q_s16(pass1_output);
    q1s16 = vld1q_s16(pass1_output);
    d12s64 = vld1_s64((int64_t *)dest);
    d13s64 = vld1_s64((int64_t *)dest);
    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1_output);
    q11s16 = vld1q_s16(pass1_output);
    d12s64 = vld1_s64((int64_t *)dest);
    d13s64 = vld1_s64((int64_t *)dest);
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1_output);
    q1s16 = vld1q_s16(pass1_output);
    d12s64 = vld1_s64((int64_t *)dest);
    d13s64 = vld1_s64((int64_t *)dest);
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1_output);
    q11s16 = vld1q_s16(pass1_output);
    d12s64 = vld1_s64((int64_t *)dest);
    d13s64 = vld1_s64((int64_t *)dest);
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
        vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    // store the data out 8,9,10,11,12,13,14,15
    d12s64 = vld1_s64((int64_t *)dest);
    q8s16 = vrshrq_n_s16(q8s16, 6);
    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q9s16 = vrshrq_n_s16(q9s16, 6);
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q2s16 = vrshrq_n_s16(q2s16, 6);
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q3s16 = vrshrq_n_s16(q3s16, 6);
    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q4s16 = vrshrq_n_s16(q4s16, 6);
    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q5s16 = vrshrq_n_s16(q5s16, 6);
    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q14s16 = vrshrq_n_s16(q14s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));

    d12s64 = vld1_s64((int64_t *)dest);
    q15s16 = vrshrq_n_s16(q15s16, 6);
        vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
  } else {  // skip_adding_dest
    // Intermediate path: combine with pass-1 rows and write raw 16-bit
    // results to |out| (no rounding, no pixel add).
    q0s16 = vld1q_s16(pass1_output);
    q1s16 = vld1q_s16(pass1_output);
    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    vst1_u64((uint64_t *)out, d25u64);
    vst1_u64((uint64_t *)out, d26u64);
    vst1_u64((uint64_t *)out, d27u64);
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1_output);
    q11s16 = vld1q_s16(pass1_output);
    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    vst1_u64((uint64_t *)out, d25u64);
    vst1_u64((uint64_t *)out, d26u64);
    vst1_u64((uint64_t *)out, d27u64);
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1_output);
    q1s16 = vld1q_s16(pass1_output);
    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    vst1_u64((uint64_t *)out, d25u64);
    vst1_u64((uint64_t *)out, d26u64);
    vst1_u64((uint64_t *)out, d27u64);
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1_output);
    q11s16 = vld1q_s16(pass1_output);
    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    vst1_u64((uint64_t *)out, d25u64);
    vst1_u64((uint64_t *)out, d26u64);
    vst1_u64((uint64_t *)out, d27u64);
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    // Remaining eight half-rows stored directly.
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
/*
 * Public pass-2 entry point for plain int16_t coefficients: de-interleaves
 * eight rows from |src| (keeping val[0] of each vld2q pair) and delegates
 * to the shared pass-2 helper.
 *
 * NOTE(review): this extract elides the trailing "int dest_stride) {" of
 * the signature, the declaration of |q0x2s16|, the "src += 16;" advances
 * between loads, and the final "dest, dest_stride);" argument line plus
 * closing brace — confirm against the complete source.
 */
void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
                                      int16_t *pass1_output,
                                      int16_t skip_adding, uint8_t *dest,
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  q0x2s16 = vld2q_s16(src);
  q8s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q9s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q10s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q11s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q12s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q13s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q14s16 = q0x2s16.val[0];
  q0x2s16 = vld2q_s16(src);
  q15s16 = q0x2s16.val[0];

  idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
                               q14s16, q15s16, out, pass1_output, skip_adding,
839 #if CONFIG_VP9_HIGHBITDEPTH
/*
 * High-bit-depth pass-2 entry point: same as the int16_t variant but the
 * coefficients arrive as tran_low_t and are narrowed via
 * load_tran_low_to_s16x2q() before the shared pass-2 helper runs.
 *
 * NOTE(review): this extract elides the "int16_t *out," and
 * "int16_t skip_adding," parameter lines of the signature, the
 * declaration of |q0x2s16|, the "src += 16;" advances between loads, and
 * the final "dest, dest_stride);" argument line plus closing brace —
 * confirm against the complete source.
 */
void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
                                               int16_t *pass1_output,
                                               uint8_t *dest, int dest_stride) {
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  q0x2s16 = load_tran_low_to_s16x2q(src);
  q8s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q9s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q10s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q11s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q12s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q13s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q14s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(src);
  q15s16 = q0x2s16.val[0];

  idct16x16_256_add_neon_pass2(q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,
                               q14s16, q15s16, out, pass1_output, skip_adding,
876 #endif // CONFIG_VP9_HIGHBITDEPTH
/*
 * Pass 1 of the reduced 16x16 IDCT used when only the first few (10)
 * coefficients are non-zero.  Loads eight rows from |in|, exploits the
 * sparsity with single vqrdmulhq multiplies (the doubled cospi constants
 * fold the Q14 rounding into the instruction), and stores eight rows of
 * intermediate results to |out|.
 *
 * NOTE(review): this extract elides several lines — the declarations of
 * |d4s16| and |q0x2s16|, the "in += 16;" advances between loads, the
 * final "&q15s16);" transpose argument, and the "out += 8;" advances
 * between the stores — confirm against the complete source.
 */
void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) {
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q15s32;

  q0x2s16 = load_tran_low_to_s16x2q(in);
  q8s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q9s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q10s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q11s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q12s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q13s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q14s16 = q0x2s16.val[0];
  q0x2s16 = load_tran_low_to_s16x2q(in);
  q15s16 = q0x2s16.val[0];

  transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,

  // Only q9 contributes to the odd half: scale by 2*cospi_28 / 2*cospi_4
  // with saturating rounding doubling multiplies.
  q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
  q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  // DC term: q8 scaled by 2*cospi_16.
  q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
  d4s16 = vdup_n_s16((int16_t)cospi_16_64);

  q8s16 = vqrdmulhq_s16(q8s16, q1s16);

  // Equalize q4/q7 via a cospi_16 rotation -> q5, q6.
  d8s16 = vget_low_s16(q4s16);
  d9s16 = vget_high_s16(q4s16);
  d14s16 = vget_low_s16(q7s16);
  d15s16 = vget_high_s16(q7s16);
  q9s32 = vmull_s16(d14s16, d4s16);
  q10s32 = vmull_s16(d15s16, d4s16);
  q12s32 = vmull_s16(d9s16, d4s16);
  q11s32 = vmull_s16(d8s16, d4s16);

  q15s32 = vsubq_s32(q10s32, q12s32);
  q6s32 = vsubq_s32(q9s32, q11s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d11s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // Final combine: every output row is DC (q8) plus/minus one of the
  // four odd-half terms.
  q2s16 = vaddq_s16(q8s16, q7s16);
  q9s16 = vaddq_s16(q8s16, q6s16);
  q10s16 = vaddq_s16(q8s16, q5s16);
  q11s16 = vaddq_s16(q8s16, q4s16);
  q12s16 = vsubq_s16(q8s16, q4s16);
  q13s16 = vsubq_s16(q8s16, q5s16);
  q14s16 = vsubq_s16(q8s16, q6s16);
  q15s16 = vsubq_s16(q8s16, q7s16);

  // Store eight intermediate rows (the "out += 8;" advances between the
  // stores are elided in this extract).
  vst1q_s16(out, q2s16);
  vst1q_s16(out, q9s16);
  vst1q_s16(out, q10s16);
  vst1q_s16(out, q11s16);
  vst1q_s16(out, q12s16);
  vst1q_s16(out, q13s16);
  vst1q_s16(out, q14s16);
  vst1q_s16(out, q15s16);
976 void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *out,
977 int16_t *pass1_output) {
978 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
979 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
980 int16x4_t d20s16, d21s16, d22s16, d23s16;
981 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
982 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
983 uint64x1_t d16u64, d17u64, d18u64, d19u64;
984 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
985 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
986 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
987 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
988 int32x4_t q10s32, q11s32, q12s32, q13s32;
991 q0x2s16 = load_tran_low_to_s16x2q(src);
992 q8s16 = q0x2s16.val[0];
994 q0x2s16 = load_tran_low_to_s16x2q(src);
995 q9s16 = q0x2s16.val[0];
997 q0x2s16 = load_tran_low_to_s16x2q(src);
998 q10s16 = q0x2s16.val[0];
1000 q0x2s16 = load_tran_low_to_s16x2q(src);
1001 q11s16 = q0x2s16.val[0];
1003 q0x2s16 = load_tran_low_to_s16x2q(src);
1004 q12s16 = q0x2s16.val[0];
1006 q0x2s16 = load_tran_low_to_s16x2q(src);
1007 q13s16 = q0x2s16.val[0];
1009 q0x2s16 = load_tran_low_to_s16x2q(src);
1010 q14s16 = q0x2s16.val[0];
1012 q0x2s16 = load_tran_low_to_s16x2q(src);
1013 q15s16 = q0x2s16.val[0];
1015 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
1019 q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
1020 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
1021 q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
1022 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
1024 q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
1025 q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
1026 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
1027 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
1030 d0s16 = vget_low_s16(q0s16);
1031 d1s16 = vget_high_s16(q0s16);
1032 d6s16 = vget_low_s16(q3s16);
1033 d7s16 = vget_high_s16(q3s16);
1034 d8s16 = vget_low_s16(q4s16);
1035 d9s16 = vget_high_s16(q4s16);
1036 d14s16 = vget_low_s16(q7s16);
1037 d15s16 = vget_high_s16(q7s16);
1039 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
1040 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
1042 q12s32 = vmull_s16(d14s16, d31s16);
1043 q5s32 = vmull_s16(d15s16, d31s16);
1044 q2s32 = vmull_s16(d0s16, d31s16);
1045 q11s32 = vmull_s16(d1s16, d31s16);
1047 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
1048 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
1049 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
1050 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
1052 d2s16 = vqrshrn_n_s32(q12s32, 14);
1053 d3s16 = vqrshrn_n_s32(q5s32, 14);
1054 d12s16 = vqrshrn_n_s32(q2s32, 14);
1055 d13s16 = vqrshrn_n_s32(q11s32, 14);
1056 q1s16 = vcombine_s16(d2s16, d3s16);
1057 q6s16 = vcombine_s16(d12s16, d13s16);
1059 d30s16 = vdup_n_s16(-cospi_8_64);
1060 q10s32 = vmull_s16(d8s16, d30s16);
1061 q13s32 = vmull_s16(d9s16, d30s16);
1062 q8s32 = vmull_s16(d6s16, d30s16);
1063 q9s32 = vmull_s16(d7s16, d30s16);
1065 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
1066 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
1067 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
1068 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
1070 d4s16 = vqrshrn_n_s32(q10s32, 14);
1071 d5s16 = vqrshrn_n_s32(q13s32, 14);
1072 d10s16 = vqrshrn_n_s32(q8s32, 14);
1073 d11s16 = vqrshrn_n_s32(q9s32, 14);
1074 q2s16 = vcombine_s16(d4s16, d5s16);
1075 q5s16 = vcombine_s16(d10s16, d11s16);
1078 q8s16 = vaddq_s16(q0s16, q3s16);
1079 q9s16 = vaddq_s16(q1s16, q2s16);
1080 q10s16 = vsubq_s16(q1s16, q2s16);
1081 q11s16 = vsubq_s16(q0s16, q3s16);
1082 q12s16 = vsubq_s16(q7s16, q4s16);
1083 q13s16 = vsubq_s16(q6s16, q5s16);
1084 q14s16 = vaddq_s16(q6s16, q5s16);
1085 q15s16 = vaddq_s16(q7s16, q4s16);
1088 d20s16 = vget_low_s16(q10s16);
1089 d21s16 = vget_high_s16(q10s16);
1090 d22s16 = vget_low_s16(q11s16);
1091 d23s16 = vget_high_s16(q11s16);
1092 d24s16 = vget_low_s16(q12s16);
1093 d25s16 = vget_high_s16(q12s16);
1094 d26s16 = vget_low_s16(q13s16);
1095 d27s16 = vget_high_s16(q13s16);
1097 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
1098 q3s32 = vmull_s16(d26s16, d14s16);
1099 q4s32 = vmull_s16(d27s16, d14s16);
1100 q0s32 = vmull_s16(d20s16, d14s16);
1101 q1s32 = vmull_s16(d21s16, d14s16);
1103 q5s32 = vsubq_s32(q3s32, q0s32);
1104 q6s32 = vsubq_s32(q4s32, q1s32);
1105 q0s32 = vaddq_s32(q3s32, q0s32);
1106 q4s32 = vaddq_s32(q4s32, q1s32);
1108 d4s16 = vqrshrn_n_s32(q5s32, 14);
1109 d5s16 = vqrshrn_n_s32(q6s32, 14);
1110 d10s16 = vqrshrn_n_s32(q0s32, 14);
1111 d11s16 = vqrshrn_n_s32(q4s32, 14);
1112 q2s16 = vcombine_s16(d4s16, d5s16);
1113 q5s16 = vcombine_s16(d10s16, d11s16);
1115 q0s32 = vmull_s16(d22s16, d14s16);
1116 q1s32 = vmull_s16(d23s16, d14s16);
1117 q13s32 = vmull_s16(d24s16, d14s16);
1118 q6s32 = vmull_s16(d25s16, d14s16);
1120 q10s32 = vsubq_s32(q13s32, q0s32);
1121 q4s32 = vsubq_s32(q6s32, q1s32);
1122 q13s32 = vaddq_s32(q13s32, q0s32);
1123 q6s32 = vaddq_s32(q6s32, q1s32);
1125 d6s16 = vqrshrn_n_s32(q10s32, 14);
1126 d7s16 = vqrshrn_n_s32(q4s32, 14);
1127 d8s16 = vqrshrn_n_s32(q13s32, 14);
1128 d9s16 = vqrshrn_n_s32(q6s32, 14);
1129 q3s16 = vcombine_s16(d6s16, d7s16);
1130 q4s16 = vcombine_s16(d8s16, d9s16);
1133 q0s16 = vld1q_s16(pass1_output);
1135 q1s16 = vld1q_s16(pass1_output);
1137 q12s16 = vaddq_s16(q0s16, q15s16);
1138 q13s16 = vaddq_s16(q1s16, q14s16);
1139 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1140 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1141 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1142 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1143 vst1_u64((uint64_t *)out, d24u64);
1145 vst1_u64((uint64_t *)out, d25u64);
1147 vst1_u64((uint64_t *)out, d26u64);
1149 vst1_u64((uint64_t *)out, d27u64);
1151 q14s16 = vsubq_s16(q1s16, q14s16);
1152 q15s16 = vsubq_s16(q0s16, q15s16);
1154 q10s16 = vld1q_s16(pass1_output);
1156 q11s16 = vld1q_s16(pass1_output);
1158 q12s16 = vaddq_s16(q10s16, q5s16);
1159 q13s16 = vaddq_s16(q11s16, q4s16);
1160 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1161 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1162 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1163 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1164 vst1_u64((uint64_t *)out, d24u64);
1166 vst1_u64((uint64_t *)out, d25u64);
1168 vst1_u64((uint64_t *)out, d26u64);
1170 vst1_u64((uint64_t *)out, d27u64);
1172 q4s16 = vsubq_s16(q11s16, q4s16);
1173 q5s16 = vsubq_s16(q10s16, q5s16);
1175 q0s16 = vld1q_s16(pass1_output);
1177 q1s16 = vld1q_s16(pass1_output);
1179 q12s16 = vaddq_s16(q0s16, q3s16);
1180 q13s16 = vaddq_s16(q1s16, q2s16);
1181 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1182 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1183 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1184 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1185 vst1_u64((uint64_t *)out, d24u64);
1187 vst1_u64((uint64_t *)out, d25u64);
1189 vst1_u64((uint64_t *)out, d26u64);
1191 vst1_u64((uint64_t *)out, d27u64);
1193 q2s16 = vsubq_s16(q1s16, q2s16);
1194 q3s16 = vsubq_s16(q0s16, q3s16);
1196 q10s16 = vld1q_s16(pass1_output);
1198 q11s16 = vld1q_s16(pass1_output);
1199 q12s16 = vaddq_s16(q10s16, q9s16);
1200 q13s16 = vaddq_s16(q11s16, q8s16);
1201 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1202 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1203 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1204 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1205 vst1_u64((uint64_t *)out, d24u64);
1207 vst1_u64((uint64_t *)out, d25u64);
1209 vst1_u64((uint64_t *)out, d26u64);
1211 vst1_u64((uint64_t *)out, d27u64);
1213 q8s16 = vsubq_s16(q11s16, q8s16);
1214 q9s16 = vsubq_s16(q10s16, q9s16);
1216 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1217 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1218 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1219 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1220 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1221 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1222 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1223 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1224 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1225 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1226 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1227 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1228 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1229 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1230 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1231 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1233 vst1_u64((uint64_t *)out, d16u64);
1235 vst1_u64((uint64_t *)out, d17u64);
1237 vst1_u64((uint64_t *)out, d18u64);
1239 vst1_u64((uint64_t *)out, d19u64);
1241 vst1_u64((uint64_t *)out, d4u64);
1243 vst1_u64((uint64_t *)out, d5u64);
1245 vst1_u64((uint64_t *)out, d6u64);
1247 vst1_u64((uint64_t *)out, d7u64);
1249 vst1_u64((uint64_t *)out, d8u64);
1251 vst1_u64((uint64_t *)out, d9u64);
1253 vst1_u64((uint64_t *)out, d10u64);
1255 vst1_u64((uint64_t *)out, d11u64);
1257 vst1_u64((uint64_t *)out, d28u64);
1259 vst1_u64((uint64_t *)out, d29u64);
1261 vst1_u64((uint64_t *)out, d30u64);
1263 vst1_u64((uint64_t *)out, d31u64);