/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_config.h"
14 #include "vpx_dsp/arm/transpose_neon.h"
15 #include "vpx_dsp/txfm_common.h"
17 void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out,
19 int16x4_t d0s16, d1s16, d2s16, d3s16;
20 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
21 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
22 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
23 uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
24 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
25 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
26 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
27 int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
28 int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
31 q0x2s16 = vld2q_s16(in);
32 q8s16 = q0x2s16.val[0];
34 q0x2s16 = vld2q_s16(in);
35 q9s16 = q0x2s16.val[0];
37 q0x2s16 = vld2q_s16(in);
38 q10s16 = q0x2s16.val[0];
40 q0x2s16 = vld2q_s16(in);
41 q11s16 = q0x2s16.val[0];
43 q0x2s16 = vld2q_s16(in);
44 q12s16 = q0x2s16.val[0];
46 q0x2s16 = vld2q_s16(in);
47 q13s16 = q0x2s16.val[0];
49 q0x2s16 = vld2q_s16(in);
50 q14s16 = q0x2s16.val[0];
52 q0x2s16 = vld2q_s16(in);
53 q15s16 = q0x2s16.val[0];
55 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
58 d16s16 = vget_low_s16(q8s16);
59 d17s16 = vget_high_s16(q8s16);
60 d18s16 = vget_low_s16(q9s16);
61 d19s16 = vget_high_s16(q9s16);
62 d20s16 = vget_low_s16(q10s16);
63 d21s16 = vget_high_s16(q10s16);
64 d22s16 = vget_low_s16(q11s16);
65 d23s16 = vget_high_s16(q11s16);
66 d24s16 = vget_low_s16(q12s16);
67 d25s16 = vget_high_s16(q12s16);
68 d26s16 = vget_low_s16(q13s16);
69 d27s16 = vget_high_s16(q13s16);
70 d28s16 = vget_low_s16(q14s16);
71 d29s16 = vget_high_s16(q14s16);
72 d30s16 = vget_low_s16(q15s16);
73 d31s16 = vget_high_s16(q15s16);
76 d0s16 = vdup_n_s16((int16_t)cospi_28_64);
77 d1s16 = vdup_n_s16((int16_t)cospi_4_64);
79 q2s32 = vmull_s16(d18s16, d0s16);
80 q3s32 = vmull_s16(d19s16, d0s16);
81 q5s32 = vmull_s16(d18s16, d1s16);
82 q6s32 = vmull_s16(d19s16, d1s16);
84 q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
85 q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
86 q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
87 q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
89 d2s16 = vdup_n_s16((int16_t)cospi_12_64);
90 d3s16 = vdup_n_s16((int16_t)cospi_20_64);
92 d8s16 = vqrshrn_n_s32(q2s32, 14);
93 d9s16 = vqrshrn_n_s32(q3s32, 14);
94 d14s16 = vqrshrn_n_s32(q5s32, 14);
95 d15s16 = vqrshrn_n_s32(q6s32, 14);
96 q4s16 = vcombine_s16(d8s16, d9s16);
97 q7s16 = vcombine_s16(d14s16, d15s16);
99 q2s32 = vmull_s16(d26s16, d2s16);
100 q3s32 = vmull_s16(d27s16, d2s16);
101 q9s32 = vmull_s16(d26s16, d3s16);
102 q15s32 = vmull_s16(d27s16, d3s16);
104 q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
105 q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
106 q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
107 q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
109 d10s16 = vqrshrn_n_s32(q2s32, 14);
110 d11s16 = vqrshrn_n_s32(q3s32, 14);
111 d12s16 = vqrshrn_n_s32(q9s32, 14);
112 d13s16 = vqrshrn_n_s32(q15s32, 14);
113 q5s16 = vcombine_s16(d10s16, d11s16);
114 q6s16 = vcombine_s16(d12s16, d13s16);
117 d30s16 = vdup_n_s16((int16_t)cospi_16_64);
119 q2s32 = vmull_s16(d16s16, d30s16);
120 q11s32 = vmull_s16(d17s16, d30s16);
121 q0s32 = vmull_s16(d24s16, d30s16);
122 q1s32 = vmull_s16(d25s16, d30s16);
124 d30s16 = vdup_n_s16((int16_t)cospi_24_64);
125 d31s16 = vdup_n_s16((int16_t)cospi_8_64);
127 q3s32 = vaddq_s32(q2s32, q0s32);
128 q12s32 = vaddq_s32(q11s32, q1s32);
129 q13s32 = vsubq_s32(q2s32, q0s32);
130 q1s32 = vsubq_s32(q11s32, q1s32);
132 d16s16 = vqrshrn_n_s32(q3s32, 14);
133 d17s16 = vqrshrn_n_s32(q12s32, 14);
134 d18s16 = vqrshrn_n_s32(q13s32, 14);
135 d19s16 = vqrshrn_n_s32(q1s32, 14);
136 q8s16 = vcombine_s16(d16s16, d17s16);
137 q9s16 = vcombine_s16(d18s16, d19s16);
139 q0s32 = vmull_s16(d20s16, d31s16);
140 q1s32 = vmull_s16(d21s16, d31s16);
141 q12s32 = vmull_s16(d20s16, d30s16);
142 q13s32 = vmull_s16(d21s16, d30s16);
144 q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
145 q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
146 q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
147 q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
149 d22s16 = vqrshrn_n_s32(q0s32, 14);
150 d23s16 = vqrshrn_n_s32(q1s32, 14);
151 d20s16 = vqrshrn_n_s32(q12s32, 14);
152 d21s16 = vqrshrn_n_s32(q13s32, 14);
153 q10s16 = vcombine_s16(d20s16, d21s16);
154 q11s16 = vcombine_s16(d22s16, d23s16);
156 q13s16 = vsubq_s16(q4s16, q5s16);
157 q4s16 = vaddq_s16(q4s16, q5s16);
158 q14s16 = vsubq_s16(q7s16, q6s16);
159 q15s16 = vaddq_s16(q6s16, q7s16);
160 d26s16 = vget_low_s16(q13s16);
161 d27s16 = vget_high_s16(q13s16);
162 d28s16 = vget_low_s16(q14s16);
163 d29s16 = vget_high_s16(q14s16);
166 q0s16 = vaddq_s16(q8s16, q11s16);
167 q1s16 = vaddq_s16(q9s16, q10s16);
168 q2s16 = vsubq_s16(q9s16, q10s16);
169 q3s16 = vsubq_s16(q8s16, q11s16);
171 d16s16 = vdup_n_s16((int16_t)cospi_16_64);
173 q11s32 = vmull_s16(d26s16, d16s16);
174 q12s32 = vmull_s16(d27s16, d16s16);
175 q9s32 = vmull_s16(d28s16, d16s16);
176 q10s32 = vmull_s16(d29s16, d16s16);
178 q6s32 = vsubq_s32(q9s32, q11s32);
179 q13s32 = vsubq_s32(q10s32, q12s32);
180 q9s32 = vaddq_s32(q9s32, q11s32);
181 q10s32 = vaddq_s32(q10s32, q12s32);
183 d10s16 = vqrshrn_n_s32(q6s32, 14);
184 d11s16 = vqrshrn_n_s32(q13s32, 14);
185 d12s16 = vqrshrn_n_s32(q9s32, 14);
186 d13s16 = vqrshrn_n_s32(q10s32, 14);
187 q5s16 = vcombine_s16(d10s16, d11s16);
188 q6s16 = vcombine_s16(d12s16, d13s16);
191 q8s16 = vaddq_s16(q0s16, q15s16);
192 q9s16 = vaddq_s16(q1s16, q6s16);
193 q10s16 = vaddq_s16(q2s16, q5s16);
194 q11s16 = vaddq_s16(q3s16, q4s16);
195 q12s16 = vsubq_s16(q3s16, q4s16);
196 q13s16 = vsubq_s16(q2s16, q5s16);
197 q14s16 = vsubq_s16(q1s16, q6s16);
198 q15s16 = vsubq_s16(q0s16, q15s16);
200 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
201 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
202 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
203 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
204 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
205 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
206 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
207 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
208 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
209 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
210 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
211 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
212 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
213 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
214 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
215 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
218 output_stride >>= 1; // output_stride / 2, out is int16_t
219 vst1_u64((uint64_t *)out, d16u64);
220 out += output_stride;
221 vst1_u64((uint64_t *)out, d17u64);
222 out += output_stride;
223 vst1_u64((uint64_t *)out, d18u64);
224 out += output_stride;
225 vst1_u64((uint64_t *)out, d19u64);
226 out += output_stride;
227 vst1_u64((uint64_t *)out, d20u64);
228 out += output_stride;
229 vst1_u64((uint64_t *)out, d21u64);
230 out += output_stride;
231 vst1_u64((uint64_t *)out, d22u64);
232 out += output_stride;
233 vst1_u64((uint64_t *)out, d23u64);
234 out += output_stride;
235 vst1_u64((uint64_t *)out, d24u64);
236 out += output_stride;
237 vst1_u64((uint64_t *)out, d25u64);
238 out += output_stride;
239 vst1_u64((uint64_t *)out, d26u64);
240 out += output_stride;
241 vst1_u64((uint64_t *)out, d27u64);
242 out += output_stride;
243 vst1_u64((uint64_t *)out, d28u64);
244 out += output_stride;
245 vst1_u64((uint64_t *)out, d29u64);
246 out += output_stride;
247 vst1_u64((uint64_t *)out, d30u64);
248 out += output_stride;
249 vst1_u64((uint64_t *)out, d31u64);
// -----------------------------------------------------------------------------
// Second pass of the 16x16 inverse DCT (NEON).  Reads 8 rows of odd-indexed
// coefficients from `src`, transforms them, combines with the first-pass
// results in `pass1_output`, and either adds the reconstruction into `dest`
// (when skip_adding != 0) or stores raw intermediates to `out`.
//
// NOTE(review): this extracted view is lossy.  Not visible: the tail of the
// signature (a `dest_stride` parameter and opening brace), declarations of
// `q0x2s16` and of the `d` destination cursor used by the stores, the
// `src += 8;` / `pass1_output += 8;` / `dest += dest_stride;` /
// `d += dest_stride;` / `out += ...;` pointer advances, and the closing
// brace.  Each statement carries a fused copy of its original line number.
// Code below is reproduced byte-identically; confirm against upstream.
// -----------------------------------------------------------------------------
252 void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
253 int16_t *pass1_output,
254 int16_t skip_adding, uint8_t *dest,
// Register-style temporaries, as in pass 1, plus u8/u16 vectors for the
// add-to-destination reconstruction path.
257 uint8x8_t d12u8, d13u8;
258 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
259 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
260 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
261 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
262 uint64x1_t d24u64, d25u64, d26u64, d27u64;
263 int64x1_t d12s64, d13s64;
264 uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
265 uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
266 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
267 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
268 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
269 int32x4_t q10s32, q11s32, q12s32, q13s32;
// De-interleaving loads of 8 rows; val[0] keeps the even lanes.
// NOTE(review): `src` is not advanced between loads in this view --
// presumably `src += 8;` lines were lost; verify upstream.
272 q0x2s16 = vld2q_s16(src);
273 q8s16 = q0x2s16.val[0];
275 q0x2s16 = vld2q_s16(src);
276 q9s16 = q0x2s16.val[0];
278 q0x2s16 = vld2q_s16(src);
279 q10s16 = q0x2s16.val[0];
281 q0x2s16 = vld2q_s16(src);
282 q11s16 = q0x2s16.val[0];
284 q0x2s16 = vld2q_s16(src);
285 q12s16 = q0x2s16.val[0];
287 q0x2s16 = vld2q_s16(src);
288 q13s16 = q0x2s16.val[0];
290 q0x2s16 = vld2q_s16(src);
291 q14s16 = q0x2s16.val[0];
293 q0x2s16 = vld2q_s16(src);
294 q15s16 = q0x2s16.val[0];
// Transpose 8x8 so the 1-D transform runs along rows.
296 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Split into d halves for widening multiplies.
299 d16s16 = vget_low_s16(q8s16);
300 d17s16 = vget_high_s16(q8s16);
301 d18s16 = vget_low_s16(q9s16);
302 d19s16 = vget_high_s16(q9s16);
303 d20s16 = vget_low_s16(q10s16);
304 d21s16 = vget_high_s16(q10s16);
305 d22s16 = vget_low_s16(q11s16);
306 d23s16 = vget_high_s16(q11s16);
307 d24s16 = vget_low_s16(q12s16);
308 d25s16 = vget_high_s16(q12s16);
309 d26s16 = vget_low_s16(q13s16);
310 d27s16 = vget_high_s16(q13s16);
311 d28s16 = vget_low_s16(q14s16);
312 d29s16 = vget_high_s16(q14s16);
313 d30s16 = vget_low_s16(q15s16);
314 d31s16 = vget_high_s16(q15s16);
// Rotation of (q8, q15) by cospi_30_64 / cospi_2_64.
317 d12s16 = vdup_n_s16((int16_t)cospi_30_64);
318 d13s16 = vdup_n_s16((int16_t)cospi_2_64);
320 q2s32 = vmull_s16(d16s16, d12s16);
321 q3s32 = vmull_s16(d17s16, d12s16);
322 q1s32 = vmull_s16(d16s16, d13s16);
323 q4s32 = vmull_s16(d17s16, d13s16);
325 q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
326 q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
327 q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
328 q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
// vqrshrn_n_s32(x, 14): rounded fixed-point descale after cospi multiply.
330 d0s16 = vqrshrn_n_s32(q2s32, 14);
331 d1s16 = vqrshrn_n_s32(q3s32, 14);
332 d14s16 = vqrshrn_n_s32(q1s32, 14);
333 d15s16 = vqrshrn_n_s32(q4s32, 14);
334 q0s16 = vcombine_s16(d0s16, d1s16);
335 q7s16 = vcombine_s16(d14s16, d15s16);
// Rotation of (q12, q11) by cospi_14_64 / cospi_18_64.
337 d30s16 = vdup_n_s16((int16_t)cospi_14_64);
338 d31s16 = vdup_n_s16((int16_t)cospi_18_64);
340 q2s32 = vmull_s16(d24s16, d30s16);
341 q3s32 = vmull_s16(d25s16, d30s16);
342 q4s32 = vmull_s16(d24s16, d31s16);
343 q5s32 = vmull_s16(d25s16, d31s16);
345 q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
346 q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
347 q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
348 q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
350 d2s16 = vqrshrn_n_s32(q2s32, 14);
351 d3s16 = vqrshrn_n_s32(q3s32, 14);
352 d12s16 = vqrshrn_n_s32(q4s32, 14);
353 d13s16 = vqrshrn_n_s32(q5s32, 14);
354 q1s16 = vcombine_s16(d2s16, d3s16);
355 q6s16 = vcombine_s16(d12s16, d13s16);
// Rotation of (q10, q13) by cospi_22_64 / cospi_10_64.
357 d30s16 = vdup_n_s16((int16_t)cospi_22_64);
358 d31s16 = vdup_n_s16((int16_t)cospi_10_64);
360 q11s32 = vmull_s16(d20s16, d30s16);
361 q12s32 = vmull_s16(d21s16, d30s16);
362 q4s32 = vmull_s16(d20s16, d31s16);
363 q5s32 = vmull_s16(d21s16, d31s16);
365 q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
366 q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
367 q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
368 q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
370 d4s16 = vqrshrn_n_s32(q11s32, 14);
371 d5s16 = vqrshrn_n_s32(q12s32, 14);
372 d11s16 = vqrshrn_n_s32(q5s32, 14);
373 d10s16 = vqrshrn_n_s32(q4s32, 14);
374 q2s16 = vcombine_s16(d4s16, d5s16);
375 q5s16 = vcombine_s16(d10s16, d11s16);
// Rotation of (q14, q9) by cospi_6_64 / cospi_26_64.
377 d30s16 = vdup_n_s16((int16_t)cospi_6_64);
378 d31s16 = vdup_n_s16((int16_t)cospi_26_64);
380 q10s32 = vmull_s16(d28s16, d30s16);
381 q11s32 = vmull_s16(d29s16, d30s16);
382 q12s32 = vmull_s16(d28s16, d31s16);
383 q13s32 = vmull_s16(d29s16, d31s16);
385 q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
386 q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
387 q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
388 q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
390 d6s16 = vqrshrn_n_s32(q10s32, 14);
391 d7s16 = vqrshrn_n_s32(q11s32, 14);
392 d8s16 = vqrshrn_n_s32(q12s32, 14);
393 d9s16 = vqrshrn_n_s32(q13s32, 14);
394 q3s16 = vcombine_s16(d6s16, d7s16);
395 q4s16 = vcombine_s16(d8s16, d9s16);
// Butterfly stage: sums and differences of the four rotated pairs.
398 q9s16 = vsubq_s16(q0s16, q1s16);
399 q0s16 = vaddq_s16(q0s16, q1s16);
400 q10s16 = vsubq_s16(q3s16, q2s16);
401 q11s16 = vaddq_s16(q2s16, q3s16);
402 q12s16 = vaddq_s16(q4s16, q5s16);
403 q13s16 = vsubq_s16(q4s16, q5s16);
404 q14s16 = vsubq_s16(q7s16, q6s16);
405 q7s16 = vaddq_s16(q6s16, q7s16);
408 d18s16 = vget_low_s16(q9s16);
409 d19s16 = vget_high_s16(q9s16);
410 d20s16 = vget_low_s16(q10s16);
411 d21s16 = vget_high_s16(q10s16);
412 d26s16 = vget_low_s16(q13s16);
413 d27s16 = vget_high_s16(q13s16);
414 d28s16 = vget_low_s16(q14s16);
415 d29s16 = vget_high_s16(q14s16);
// Rotation of the difference terms by cospi_8_64 / cospi_24_64.
417 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
418 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
420 q2s32 = vmull_s16(d18s16, d31s16);
421 q3s32 = vmull_s16(d19s16, d31s16);
422 q4s32 = vmull_s16(d28s16, d31s16);
423 q5s32 = vmull_s16(d29s16, d31s16);
425 q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
426 q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
427 q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
428 q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
430 d12s16 = vqrshrn_n_s32(q2s32, 14);
431 d13s16 = vqrshrn_n_s32(q3s32, 14);
432 d2s16 = vqrshrn_n_s32(q4s32, 14);
433 d3s16 = vqrshrn_n_s32(q5s32, 14);
434 q1s16 = vcombine_s16(d2s16, d3s16);
435 q6s16 = vcombine_s16(d12s16, d13s16);
// Companion rotation using the negated constant (-cospi_8_64).
440 d30s16 = vdup_n_s16(-cospi_8_64);
441 q11s32 = vmull_s16(d26s16, d30s16);
442 q12s32 = vmull_s16(d27s16, d30s16);
443 q8s32 = vmull_s16(d20s16, d30s16);
444 q9s32 = vmull_s16(d21s16, d30s16);
446 q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
447 q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
448 q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
449 q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
451 d4s16 = vqrshrn_n_s32(q11s32, 14);
452 d5s16 = vqrshrn_n_s32(q12s32, 14);
453 d10s16 = vqrshrn_n_s32(q8s32, 14);
454 d11s16 = vqrshrn_n_s32(q9s32, 14);
455 q2s16 = vcombine_s16(d4s16, d5s16);
456 q5s16 = vcombine_s16(d10s16, d11s16);
// Second butterfly stage.
459 q8s16 = vaddq_s16(q0s16, q3s16);
460 q9s16 = vaddq_s16(q1s16, q2s16);
461 q10s16 = vsubq_s16(q1s16, q2s16);
462 q11s16 = vsubq_s16(q0s16, q3s16);
463 q12s16 = vsubq_s16(q7s16, q4s16);
464 q13s16 = vsubq_s16(q6s16, q5s16);
465 q14s16 = vaddq_s16(q6s16, q5s16);
466 q15s16 = vaddq_s16(q7s16, q4s16);
469 d20s16 = vget_low_s16(q10s16);
470 d21s16 = vget_high_s16(q10s16);
471 d22s16 = vget_low_s16(q11s16);
472 d23s16 = vget_high_s16(q11s16);
473 d24s16 = vget_low_s16(q12s16);
474 d25s16 = vget_high_s16(q12s16);
475 d26s16 = vget_low_s16(q13s16);
476 d27s16 = vget_high_s16(q13s16);
// cospi_16_64 equalization of the middle difference terms.
478 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
480 q3s32 = vmull_s16(d26s16, d14s16);
481 q4s32 = vmull_s16(d27s16, d14s16);
482 q0s32 = vmull_s16(d20s16, d14s16);
483 q1s32 = vmull_s16(d21s16, d14s16);
485 q5s32 = vsubq_s32(q3s32, q0s32);
486 q6s32 = vsubq_s32(q4s32, q1s32);
487 q10s32 = vaddq_s32(q3s32, q0s32);
488 q4s32 = vaddq_s32(q4s32, q1s32);
490 d4s16 = vqrshrn_n_s32(q5s32, 14);
491 d5s16 = vqrshrn_n_s32(q6s32, 14);
492 d10s16 = vqrshrn_n_s32(q10s32, 14);
493 d11s16 = vqrshrn_n_s32(q4s32, 14);
494 q2s16 = vcombine_s16(d4s16, d5s16);
495 q5s16 = vcombine_s16(d10s16, d11s16);
497 q0s32 = vmull_s16(d22s16, d14s16);
498 q1s32 = vmull_s16(d23s16, d14s16);
499 q13s32 = vmull_s16(d24s16, d14s16);
500 q6s32 = vmull_s16(d25s16, d14s16);
502 q10s32 = vsubq_s32(q13s32, q0s32);
503 q4s32 = vsubq_s32(q6s32, q1s32);
504 q13s32 = vaddq_s32(q13s32, q0s32);
505 q6s32 = vaddq_s32(q6s32, q1s32);
507 d6s16 = vqrshrn_n_s32(q10s32, 14);
508 d7s16 = vqrshrn_n_s32(q4s32, 14);
509 d8s16 = vqrshrn_n_s32(q13s32, 14);
510 d9s16 = vqrshrn_n_s32(q6s32, 14);
511 q3s16 = vcombine_s16(d6s16, d7s16);
512 q4s16 = vcombine_s16(d8s16, d9s16);
// Reconstruction path: combine with pass-1 data, round-shift by 6 (the
// 16x16 IDCT final descale), add to the destination pixels, saturate to u8.
// NOTE(review): the `d` cursor used by the stores and all pointer advances
// (`pass1_output += 8;`, `dest += dest_stride;`, `d += dest_stride;`) are
// missing from this view -- verify against upstream before relying on the
// addressing here.
515 if (skip_adding != 0) {
517 // load the data in pass1
518 q0s16 = vld1q_s16(pass1_output);
520 q1s16 = vld1q_s16(pass1_output);
522 d12s64 = vld1_s64((int64_t *)dest);
524 d13s64 = vld1_s64((int64_t *)dest);
527 q12s16 = vaddq_s16(q0s16, q15s16);
528 q13s16 = vaddq_s16(q1s16, q14s16);
529 q12s16 = vrshrq_n_s16(q12s16, 6);
530 q13s16 = vrshrq_n_s16(q13s16, 6);
// NOTE(review): the `q12u16 =` / `q13u16 =` assignment halves of these
// vaddw_u8 statements are on lines not visible here.
532 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
534 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
535 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
536 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
537 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
539 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
541 q14s16 = vsubq_s16(q1s16, q14s16);
542 q15s16 = vsubq_s16(q0s16, q15s16);
// Rows 2-3.
544 q10s16 = vld1q_s16(pass1_output);
546 q11s16 = vld1q_s16(pass1_output);
548 d12s64 = vld1_s64((int64_t *)dest);
550 d13s64 = vld1_s64((int64_t *)dest);
552 q12s16 = vaddq_s16(q10s16, q5s16);
553 q13s16 = vaddq_s16(q11s16, q4s16);
554 q12s16 = vrshrq_n_s16(q12s16, 6);
555 q13s16 = vrshrq_n_s16(q13s16, 6);
557 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
559 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
560 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
561 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
562 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
564 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
566 q4s16 = vsubq_s16(q11s16, q4s16);
567 q5s16 = vsubq_s16(q10s16, q5s16);
// Rows 4-5.
569 q0s16 = vld1q_s16(pass1_output);
571 q1s16 = vld1q_s16(pass1_output);
573 d12s64 = vld1_s64((int64_t *)dest);
575 d13s64 = vld1_s64((int64_t *)dest);
577 q12s16 = vaddq_s16(q0s16, q3s16);
578 q13s16 = vaddq_s16(q1s16, q2s16);
579 q12s16 = vrshrq_n_s16(q12s16, 6);
580 q13s16 = vrshrq_n_s16(q13s16, 6);
582 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
584 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
585 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
586 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
587 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
589 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
591 q2s16 = vsubq_s16(q1s16, q2s16);
592 q3s16 = vsubq_s16(q0s16, q3s16);
// Rows 6-7.
594 q10s16 = vld1q_s16(pass1_output);
596 q11s16 = vld1q_s16(pass1_output);
597 d12s64 = vld1_s64((int64_t *)dest);
599 d13s64 = vld1_s64((int64_t *)dest);
601 q12s16 = vaddq_s16(q10s16, q9s16);
602 q13s16 = vaddq_s16(q11s16, q8s16);
603 q12s16 = vrshrq_n_s16(q12s16, 6);
604 q13s16 = vrshrq_n_s16(q13s16, 6);
606 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
608 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
609 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
610 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
611 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
613 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
615 q8s16 = vsubq_s16(q11s16, q8s16);
616 q9s16 = vsubq_s16(q10s16, q9s16);
// Lower half: rows 8-15, one register each, same round/add/saturate scheme.
618 // store the data out 8,9,10,11,12,13,14,15
619 d12s64 = vld1_s64((int64_t *)dest);
621 q8s16 = vrshrq_n_s16(q8s16, 6);
622 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
623 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
624 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
627 d12s64 = vld1_s64((int64_t *)dest);
629 q9s16 = vrshrq_n_s16(q9s16, 6);
630 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
631 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
632 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
635 d12s64 = vld1_s64((int64_t *)dest);
637 q2s16 = vrshrq_n_s16(q2s16, 6);
638 q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
639 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
640 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
643 d12s64 = vld1_s64((int64_t *)dest);
645 q3s16 = vrshrq_n_s16(q3s16, 6);
646 q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
647 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
648 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
651 d12s64 = vld1_s64((int64_t *)dest);
653 q4s16 = vrshrq_n_s16(q4s16, 6);
654 q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
655 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
656 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
659 d12s64 = vld1_s64((int64_t *)dest);
661 q5s16 = vrshrq_n_s16(q5s16, 6);
662 q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
663 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
664 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
667 d12s64 = vld1_s64((int64_t *)dest);
669 q14s16 = vrshrq_n_s16(q14s16, 6);
671 vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
672 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
673 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
676 d12s64 = vld1_s64((int64_t *)dest);
677 q15s16 = vrshrq_n_s16(q15s16, 6);
679 vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
680 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
681 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
// No-add path: same combines with pass-1 data, but raw int16 intermediates
// are stored to `out` for a later pass instead of reconstructing pixels.
682 } else {  // skip_adding_dest
683 q0s16 = vld1q_s16(pass1_output);
685 q1s16 = vld1q_s16(pass1_output);
687 q12s16 = vaddq_s16(q0s16, q15s16);
688 q13s16 = vaddq_s16(q1s16, q14s16);
689 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
690 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
691 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
692 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
693 vst1_u64((uint64_t *)out, d24u64);
695 vst1_u64((uint64_t *)out, d25u64);
697 vst1_u64((uint64_t *)out, d26u64);
699 vst1_u64((uint64_t *)out, d27u64);
701 q14s16 = vsubq_s16(q1s16, q14s16);
702 q15s16 = vsubq_s16(q0s16, q15s16);
704 q10s16 = vld1q_s16(pass1_output);
706 q11s16 = vld1q_s16(pass1_output);
708 q12s16 = vaddq_s16(q10s16, q5s16);
709 q13s16 = vaddq_s16(q11s16, q4s16);
710 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
711 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
712 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
713 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
714 vst1_u64((uint64_t *)out, d24u64);
716 vst1_u64((uint64_t *)out, d25u64);
718 vst1_u64((uint64_t *)out, d26u64);
720 vst1_u64((uint64_t *)out, d27u64);
722 q4s16 = vsubq_s16(q11s16, q4s16);
723 q5s16 = vsubq_s16(q10s16, q5s16);
725 q0s16 = vld1q_s16(pass1_output);
727 q1s16 = vld1q_s16(pass1_output);
729 q12s16 = vaddq_s16(q0s16, q3s16);
730 q13s16 = vaddq_s16(q1s16, q2s16);
731 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
732 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
733 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
734 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
735 vst1_u64((uint64_t *)out, d24u64);
737 vst1_u64((uint64_t *)out, d25u64);
739 vst1_u64((uint64_t *)out, d26u64);
741 vst1_u64((uint64_t *)out, d27u64);
743 q2s16 = vsubq_s16(q1s16, q2s16);
744 q3s16 = vsubq_s16(q0s16, q3s16);
746 q10s16 = vld1q_s16(pass1_output);
748 q11s16 = vld1q_s16(pass1_output);
750 q12s16 = vaddq_s16(q10s16, q9s16);
751 q13s16 = vaddq_s16(q11s16, q8s16);
752 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
753 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
754 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
755 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
756 vst1_u64((uint64_t *)out, d24u64);
758 vst1_u64((uint64_t *)out, d25u64);
760 vst1_u64((uint64_t *)out, d26u64);
762 vst1_u64((uint64_t *)out, d27u64);
764 q8s16 = vsubq_s16(q11s16, q8s16);
765 q9s16 = vsubq_s16(q10s16, q9s16);
// Remaining eight rows stored directly from the q registers.
767 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
769 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
771 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
773 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
775 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
777 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
779 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
781 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
783 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
785 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
787 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
789 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
791 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
793 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
795 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
797 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
// -----------------------------------------------------------------------------
// First pass of the 16x16 IDCT for the "10" (mostly-zero coefficients)
// shortcut: only a handful of coefficients are nonzero, so most rotations
// collapse to a single vqrdmulhq_s16 by a doubled cospi constant
// (vqrdmulh computes (2*a*b + rounding) >> 16, so a 2*cospi operand yields
// the same (a*cospi + 2^13) >> 14 descale as the full vmull/vqrshrn path --
// presumably; confirm against the scalar reference).
//
// NOTE(review): lossy view -- the signature tail (the `output_stride`
// parameter used below), the `q0x2s16` declaration, the `in += 8;`
// advances between loads, and the closing brace are not visible, and each
// statement carries a fused original line number.  Code is byte-identical.
// -----------------------------------------------------------------------------
801 void vpx_idct16x16_10_add_neon_pass1(const int16_t *in, int16_t *out,
804 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
805 uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
806 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
807 int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
808 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
809 int32x4_t q6s32, q9s32;
810 int32x4_t q10s32, q11s32, q12s32, q15s32;
// De-interleaving loads as in the full pass; only q8/q9 carry nonzero data
// in this shortcut, the rest exist to feed the transpose.
813 q0x2s16 = vld2q_s16(in);
814 q8s16 = q0x2s16.val[0];
816 q0x2s16 = vld2q_s16(in);
817 q9s16 = q0x2s16.val[0];
819 q0x2s16 = vld2q_s16(in);
820 q10s16 = q0x2s16.val[0];
822 q0x2s16 = vld2q_s16(in);
823 q11s16 = q0x2s16.val[0];
825 q0x2s16 = vld2q_s16(in);
826 q12s16 = q0x2s16.val[0];
828 q0x2s16 = vld2q_s16(in);
829 q13s16 = q0x2s16.val[0];
831 q0x2s16 = vld2q_s16(in);
832 q14s16 = q0x2s16.val[0];
834 q0x2s16 = vld2q_s16(in);
835 q15s16 = q0x2s16.val[0];
837 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Shortcut rotation: q9 * cospi_28/cospi_4 via doubled-constant vqrdmulh.
841 q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
842 q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
844 q4s16 = vqrdmulhq_s16(q9s16, q0s16);
845 q7s16 = vqrdmulhq_s16(q9s16, q1s16);
// DC term: q8 * cospi_16_64, same doubled-constant trick.
848 q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
849 d4s16 = vdup_n_s16((int16_t)cospi_16_64);
851 q8s16 = vqrdmulhq_s16(q8s16, q1s16);
// cospi_16_64 equalization of the odd terms (full-precision path).
853 d8s16 = vget_low_s16(q4s16);
854 d9s16 = vget_high_s16(q4s16);
855 d14s16 = vget_low_s16(q7s16);
856 d15s16 = vget_high_s16(q7s16);
857 q9s32 = vmull_s16(d14s16, d4s16);
858 q10s32 = vmull_s16(d15s16, d4s16);
859 q12s32 = vmull_s16(d9s16, d4s16);
860 q11s32 = vmull_s16(d8s16, d4s16);
862 q15s32 = vsubq_s32(q10s32, q12s32);
863 q6s32 = vsubq_s32(q9s32, q11s32);
864 q9s32 = vaddq_s32(q9s32, q11s32);
865 q10s32 = vaddq_s32(q10s32, q12s32);
// Rounded descale by 2^14.
867 d11s16 = vqrshrn_n_s32(q15s32, 14);
868 d10s16 = vqrshrn_n_s32(q6s32, 14);
869 d12s16 = vqrshrn_n_s32(q9s32, 14);
870 d13s16 = vqrshrn_n_s32(q10s32, 14);
871 q5s16 = vcombine_s16(d10s16, d11s16);
872 q6s16 = vcombine_s16(d12s16, d13s16);
// Final butterflies: the DC term q8 fans out against q4..q7.
875 q2s16 = vaddq_s16(q8s16, q7s16);
876 q9s16 = vaddq_s16(q8s16, q6s16);
877 q10s16 = vaddq_s16(q8s16, q5s16);
878 q11s16 = vaddq_s16(q8s16, q4s16);
879 q12s16 = vsubq_s16(q8s16, q4s16);
880 q13s16 = vsubq_s16(q8s16, q5s16);
881 q14s16 = vsubq_s16(q8s16, q6s16);
882 q15s16 = vsubq_s16(q8s16, q7s16);
// u64 views for the 64-bit stores.
884 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
885 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
886 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
887 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
888 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
889 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
890 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
891 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
892 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
893 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
894 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
895 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
896 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
897 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
898 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
899 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
// Store 16 half-rows; stride halved because `out` is int16_t.
902 output_stride >>= 1;  // output_stride / 2, out is int16_t
903 vst1_u64((uint64_t *)out, d4u64);
904 out += output_stride;
905 vst1_u64((uint64_t *)out, d5u64);
906 out += output_stride;
907 vst1_u64((uint64_t *)out, d18u64);
908 out += output_stride;
909 vst1_u64((uint64_t *)out, d19u64);
910 out += output_stride;
911 vst1_u64((uint64_t *)out, d20u64);
912 out += output_stride;
913 vst1_u64((uint64_t *)out, d21u64);
914 out += output_stride;
915 vst1_u64((uint64_t *)out, d22u64);
916 out += output_stride;
917 vst1_u64((uint64_t *)out, d23u64);
918 out += output_stride;
919 vst1_u64((uint64_t *)out, d24u64);
920 out += output_stride;
921 vst1_u64((uint64_t *)out, d25u64);
922 out += output_stride;
923 vst1_u64((uint64_t *)out, d26u64);
924 out += output_stride;
925 vst1_u64((uint64_t *)out, d27u64);
926 out += output_stride;
927 vst1_u64((uint64_t *)out, d28u64);
928 out += output_stride;
929 vst1_u64((uint64_t *)out, d29u64);
930 out += output_stride;
931 vst1_u64((uint64_t *)out, d30u64);
932 out += output_stride;
933 vst1_u64((uint64_t *)out, d31u64);
936 void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *out,
937 int16_t *pass1_output) {
938 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
939 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
940 int16x4_t d20s16, d21s16, d22s16, d23s16;
941 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
942 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
943 uint64x1_t d16u64, d17u64, d18u64, d19u64;
944 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
945 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
946 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
947 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
948 int32x4_t q10s32, q11s32, q12s32, q13s32;
951 q0x2s16 = vld2q_s16(src);
952 q8s16 = q0x2s16.val[0];
954 q0x2s16 = vld2q_s16(src);
955 q9s16 = q0x2s16.val[0];
957 q0x2s16 = vld2q_s16(src);
958 q10s16 = q0x2s16.val[0];
960 q0x2s16 = vld2q_s16(src);
961 q11s16 = q0x2s16.val[0];
963 q0x2s16 = vld2q_s16(src);
964 q12s16 = q0x2s16.val[0];
966 q0x2s16 = vld2q_s16(src);
967 q13s16 = q0x2s16.val[0];
969 q0x2s16 = vld2q_s16(src);
970 q14s16 = q0x2s16.val[0];
972 q0x2s16 = vld2q_s16(src);
973 q15s16 = q0x2s16.val[0];
975 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
979 q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
980 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
981 q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
982 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
984 q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
985 q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
986 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
987 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
990 d0s16 = vget_low_s16(q0s16);
991 d1s16 = vget_high_s16(q0s16);
992 d6s16 = vget_low_s16(q3s16);
993 d7s16 = vget_high_s16(q3s16);
994 d8s16 = vget_low_s16(q4s16);
995 d9s16 = vget_high_s16(q4s16);
996 d14s16 = vget_low_s16(q7s16);
997 d15s16 = vget_high_s16(q7s16);
999 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
1000 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
1002 q12s32 = vmull_s16(d14s16, d31s16);
1003 q5s32 = vmull_s16(d15s16, d31s16);
1004 q2s32 = vmull_s16(d0s16, d31s16);
1005 q11s32 = vmull_s16(d1s16, d31s16);
1007 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
1008 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
1009 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
1010 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
1012 d2s16 = vqrshrn_n_s32(q12s32, 14);
1013 d3s16 = vqrshrn_n_s32(q5s32, 14);
1014 d12s16 = vqrshrn_n_s32(q2s32, 14);
1015 d13s16 = vqrshrn_n_s32(q11s32, 14);
1016 q1s16 = vcombine_s16(d2s16, d3s16);
1017 q6s16 = vcombine_s16(d12s16, d13s16);
1019 d30s16 = vdup_n_s16(-cospi_8_64);
1020 q10s32 = vmull_s16(d8s16, d30s16);
1021 q13s32 = vmull_s16(d9s16, d30s16);
1022 q8s32 = vmull_s16(d6s16, d30s16);
1023 q9s32 = vmull_s16(d7s16, d30s16);
1025 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
1026 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
1027 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
1028 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
1030 d4s16 = vqrshrn_n_s32(q10s32, 14);
1031 d5s16 = vqrshrn_n_s32(q13s32, 14);
1032 d10s16 = vqrshrn_n_s32(q8s32, 14);
1033 d11s16 = vqrshrn_n_s32(q9s32, 14);
1034 q2s16 = vcombine_s16(d4s16, d5s16);
1035 q5s16 = vcombine_s16(d10s16, d11s16);
1038 q8s16 = vaddq_s16(q0s16, q3s16);
1039 q9s16 = vaddq_s16(q1s16, q2s16);
1040 q10s16 = vsubq_s16(q1s16, q2s16);
1041 q11s16 = vsubq_s16(q0s16, q3s16);
1042 q12s16 = vsubq_s16(q7s16, q4s16);
1043 q13s16 = vsubq_s16(q6s16, q5s16);
1044 q14s16 = vaddq_s16(q6s16, q5s16);
1045 q15s16 = vaddq_s16(q7s16, q4s16);
1048 d20s16 = vget_low_s16(q10s16);
1049 d21s16 = vget_high_s16(q10s16);
1050 d22s16 = vget_low_s16(q11s16);
1051 d23s16 = vget_high_s16(q11s16);
1052 d24s16 = vget_low_s16(q12s16);
1053 d25s16 = vget_high_s16(q12s16);
1054 d26s16 = vget_low_s16(q13s16);
1055 d27s16 = vget_high_s16(q13s16);
1057 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
1058 q3s32 = vmull_s16(d26s16, d14s16);
1059 q4s32 = vmull_s16(d27s16, d14s16);
1060 q0s32 = vmull_s16(d20s16, d14s16);
1061 q1s32 = vmull_s16(d21s16, d14s16);
1063 q5s32 = vsubq_s32(q3s32, q0s32);
1064 q6s32 = vsubq_s32(q4s32, q1s32);
1065 q0s32 = vaddq_s32(q3s32, q0s32);
1066 q4s32 = vaddq_s32(q4s32, q1s32);
1068 d4s16 = vqrshrn_n_s32(q5s32, 14);
1069 d5s16 = vqrshrn_n_s32(q6s32, 14);
1070 d10s16 = vqrshrn_n_s32(q0s32, 14);
1071 d11s16 = vqrshrn_n_s32(q4s32, 14);
1072 q2s16 = vcombine_s16(d4s16, d5s16);
1073 q5s16 = vcombine_s16(d10s16, d11s16);
1075 q0s32 = vmull_s16(d22s16, d14s16);
1076 q1s32 = vmull_s16(d23s16, d14s16);
1077 q13s32 = vmull_s16(d24s16, d14s16);
1078 q6s32 = vmull_s16(d25s16, d14s16);
1080 q10s32 = vsubq_s32(q13s32, q0s32);
1081 q4s32 = vsubq_s32(q6s32, q1s32);
1082 q13s32 = vaddq_s32(q13s32, q0s32);
1083 q6s32 = vaddq_s32(q6s32, q1s32);
1085 d6s16 = vqrshrn_n_s32(q10s32, 14);
1086 d7s16 = vqrshrn_n_s32(q4s32, 14);
1087 d8s16 = vqrshrn_n_s32(q13s32, 14);
1088 d9s16 = vqrshrn_n_s32(q6s32, 14);
1089 q3s16 = vcombine_s16(d6s16, d7s16);
1090 q4s16 = vcombine_s16(d8s16, d9s16);
1093 q0s16 = vld1q_s16(pass1_output);
1095 q1s16 = vld1q_s16(pass1_output);
1097 q12s16 = vaddq_s16(q0s16, q15s16);
1098 q13s16 = vaddq_s16(q1s16, q14s16);
1099 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1100 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1101 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1102 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1103 vst1_u64((uint64_t *)out, d24u64);
1105 vst1_u64((uint64_t *)out, d25u64);
1107 vst1_u64((uint64_t *)out, d26u64);
1109 vst1_u64((uint64_t *)out, d27u64);
1111 q14s16 = vsubq_s16(q1s16, q14s16);
1112 q15s16 = vsubq_s16(q0s16, q15s16);
1114 q10s16 = vld1q_s16(pass1_output);
1116 q11s16 = vld1q_s16(pass1_output);
1118 q12s16 = vaddq_s16(q10s16, q5s16);
1119 q13s16 = vaddq_s16(q11s16, q4s16);
1120 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1121 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1122 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1123 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1124 vst1_u64((uint64_t *)out, d24u64);
1126 vst1_u64((uint64_t *)out, d25u64);
1128 vst1_u64((uint64_t *)out, d26u64);
1130 vst1_u64((uint64_t *)out, d27u64);
1132 q4s16 = vsubq_s16(q11s16, q4s16);
1133 q5s16 = vsubq_s16(q10s16, q5s16);
1135 q0s16 = vld1q_s16(pass1_output);
1137 q1s16 = vld1q_s16(pass1_output);
1139 q12s16 = vaddq_s16(q0s16, q3s16);
1140 q13s16 = vaddq_s16(q1s16, q2s16);
1141 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1142 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1143 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1144 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1145 vst1_u64((uint64_t *)out, d24u64);
1147 vst1_u64((uint64_t *)out, d25u64);
1149 vst1_u64((uint64_t *)out, d26u64);
1151 vst1_u64((uint64_t *)out, d27u64);
1153 q2s16 = vsubq_s16(q1s16, q2s16);
1154 q3s16 = vsubq_s16(q0s16, q3s16);
1156 q10s16 = vld1q_s16(pass1_output);
1158 q11s16 = vld1q_s16(pass1_output);
1159 q12s16 = vaddq_s16(q10s16, q9s16);
1160 q13s16 = vaddq_s16(q11s16, q8s16);
1161 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1162 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1163 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1164 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1165 vst1_u64((uint64_t *)out, d24u64);
1167 vst1_u64((uint64_t *)out, d25u64);
1169 vst1_u64((uint64_t *)out, d26u64);
1171 vst1_u64((uint64_t *)out, d27u64);
1173 q8s16 = vsubq_s16(q11s16, q8s16);
1174 q9s16 = vsubq_s16(q10s16, q9s16);
1176 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1177 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1178 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1179 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1180 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1181 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1182 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1183 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1184 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1185 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1186 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1187 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1188 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1189 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1190 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1191 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1193 vst1_u64((uint64_t *)out, d16u64);
1195 vst1_u64((uint64_t *)out, d17u64);
1197 vst1_u64((uint64_t *)out, d18u64);
1199 vst1_u64((uint64_t *)out, d19u64);
1201 vst1_u64((uint64_t *)out, d4u64);
1203 vst1_u64((uint64_t *)out, d5u64);
1205 vst1_u64((uint64_t *)out, d6u64);
1207 vst1_u64((uint64_t *)out, d7u64);
1209 vst1_u64((uint64_t *)out, d8u64);
1211 vst1_u64((uint64_t *)out, d9u64);
1213 vst1_u64((uint64_t *)out, d10u64);
1215 vst1_u64((uint64_t *)out, d11u64);
1217 vst1_u64((uint64_t *)out, d28u64);
1219 vst1_u64((uint64_t *)out, d29u64);
1221 vst1_u64((uint64_t *)out, d30u64);
1223 vst1_u64((uint64_t *)out, d31u64);