2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
13 #include "vpx_dsp/arm/idct_neon.h"
14 #include "vpx_dsp/txfm_common.h"
// Pass 1 of the full (up to 256 nonzero coefficients) 16x16 inverse DCT.
// Reads the even-indexed int16 lanes of the input rows, transposes the
// resulting 8x8 half, runs the 1-D idct8 butterfly network, and stores
// eight transformed rows of 8 lanes each to |out|.
// The cospi_* constants (from txfm_common.h) are Q14 fixed-point, so every
// widening multiply is rounded back to int16 with vrshrn_n_s32(x, 14).
// NOTE(review): this extraction appears to be missing the declaration of
// q0x2s16 and the pointer-advance statements (e.g. "in += ...",
// "out += ...") between the repeated vld2q_s16()/vst1q_s16() calls, as
// well as the transpose call's continuation line -- confirm against the
// upstream file before relying on this text.
16 void vpx_idct16x16_256_add_neon_pass1(const int16_t *in, int16_t *out) {
17 int16x4_t d0s16, d1s16, d2s16, d3s16;
18 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
19 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
20 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
21 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
22 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
23 int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
24 int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
// Load input: vld2q_s16 de-interleaves pairs, so val[0] keeps only the
// even-indexed lanes of each 16-element source line.
27 q0x2s16 = vld2q_s16(in);
28 q8s16 = q0x2s16.val[0];
30 q0x2s16 = vld2q_s16(in);
31 q9s16 = q0x2s16.val[0];
33 q0x2s16 = vld2q_s16(in);
34 q10s16 = q0x2s16.val[0];
36 q0x2s16 = vld2q_s16(in);
37 q11s16 = q0x2s16.val[0];
39 q0x2s16 = vld2q_s16(in);
40 q12s16 = q0x2s16.val[0];
42 q0x2s16 = vld2q_s16(in);
43 q13s16 = q0x2s16.val[0];
45 q0x2s16 = vld2q_s16(in);
46 q14s16 = q0x2s16.val[0];
48 q0x2s16 = vld2q_s16(in);
49 q15s16 = q0x2s16.val[0];
// Transpose the 8x8 half so the 1-D transform below operates along the
// other dimension.
51 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Split each 128-bit register into 64-bit halves; the widening multiply
// intrinsics (vmull/vmlal/vmlsl) take int16x4_t operands.
54 d16s16 = vget_low_s16(q8s16);
55 d17s16 = vget_high_s16(q8s16);
56 d18s16 = vget_low_s16(q9s16);
57 d19s16 = vget_high_s16(q9s16);
58 d20s16 = vget_low_s16(q10s16);
59 d21s16 = vget_high_s16(q10s16);
60 d22s16 = vget_low_s16(q11s16);
61 d23s16 = vget_high_s16(q11s16);
62 d24s16 = vget_low_s16(q12s16);
63 d25s16 = vget_high_s16(q12s16);
64 d26s16 = vget_low_s16(q13s16);
65 d27s16 = vget_high_s16(q13s16);
66 d28s16 = vget_low_s16(q14s16);
67 d29s16 = vget_high_s16(q14s16);
68 d30s16 = vget_low_s16(q15s16);
69 d31s16 = vget_high_s16(q15s16);
// Butterfly of rows 1/7 (q9/q15) with cospi_28_64 / cospi_4_64 -> q4, q7.
72 d0s16 = vdup_n_s16((int16_t)cospi_28_64);
73 d1s16 = vdup_n_s16((int16_t)cospi_4_64);
75 q2s32 = vmull_s16(d18s16, d0s16);
76 q3s32 = vmull_s16(d19s16, d0s16);
77 q5s32 = vmull_s16(d18s16, d1s16);
78 q6s32 = vmull_s16(d19s16, d1s16);
80 q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
81 q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
82 q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
83 q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
85 d2s16 = vdup_n_s16((int16_t)cospi_12_64);
86 d3s16 = vdup_n_s16((int16_t)cospi_20_64);
// Round Q14 products back to int16.
88 d8s16 = vrshrn_n_s32(q2s32, 14);
89 d9s16 = vrshrn_n_s32(q3s32, 14);
90 d14s16 = vrshrn_n_s32(q5s32, 14);
91 d15s16 = vrshrn_n_s32(q6s32, 14);
92 q4s16 = vcombine_s16(d8s16, d9s16);
93 q7s16 = vcombine_s16(d14s16, d15s16);
// Butterfly of rows 3/5 (q11/q13) with cospi_12_64 / cospi_20_64 -> q5, q6.
95 q2s32 = vmull_s16(d26s16, d2s16);
96 q3s32 = vmull_s16(d27s16, d2s16);
97 q9s32 = vmull_s16(d26s16, d3s16);
98 q15s32 = vmull_s16(d27s16, d3s16);
100 q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
101 q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
102 q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
103 q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
105 d10s16 = vrshrn_n_s32(q2s32, 14);
106 d11s16 = vrshrn_n_s32(q3s32, 14);
107 d12s16 = vrshrn_n_s32(q9s32, 14);
108 d13s16 = vrshrn_n_s32(q15s32, 14);
109 q5s16 = vcombine_s16(d10s16, d11s16);
110 q6s16 = vcombine_s16(d12s16, d13s16);
// Even half: rows 0/4 scaled by cospi_16_64, then sum/difference -> q8, q9.
113 d30s16 = vdup_n_s16((int16_t)cospi_16_64);
115 q2s32 = vmull_s16(d16s16, d30s16);
116 q11s32 = vmull_s16(d17s16, d30s16);
117 q0s32 = vmull_s16(d24s16, d30s16);
118 q1s32 = vmull_s16(d25s16, d30s16);
// d30/d31 are re-purposed here as the cospi_24_64 / cospi_8_64 constants.
120 d30s16 = vdup_n_s16((int16_t)cospi_24_64);
121 d31s16 = vdup_n_s16((int16_t)cospi_8_64);
123 q3s32 = vaddq_s32(q2s32, q0s32);
124 q12s32 = vaddq_s32(q11s32, q1s32);
125 q13s32 = vsubq_s32(q2s32, q0s32);
126 q1s32 = vsubq_s32(q11s32, q1s32);
128 d16s16 = vrshrn_n_s32(q3s32, 14);
129 d17s16 = vrshrn_n_s32(q12s32, 14);
130 d18s16 = vrshrn_n_s32(q13s32, 14);
131 d19s16 = vrshrn_n_s32(q1s32, 14);
132 q8s16 = vcombine_s16(d16s16, d17s16);
133 q9s16 = vcombine_s16(d18s16, d19s16);
// Rows 2/6 with cospi_8_64 / cospi_24_64 -> q10, q11.
135 q0s32 = vmull_s16(d20s16, d31s16);
136 q1s32 = vmull_s16(d21s16, d31s16);
137 q12s32 = vmull_s16(d20s16, d30s16);
138 q13s32 = vmull_s16(d21s16, d30s16);
140 q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
141 q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
142 q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
143 q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
145 d22s16 = vrshrn_n_s32(q0s32, 14);
146 d23s16 = vrshrn_n_s32(q1s32, 14);
147 d20s16 = vrshrn_n_s32(q12s32, 14);
148 d21s16 = vrshrn_n_s32(q13s32, 14);
149 q10s16 = vcombine_s16(d20s16, d21s16);
150 q11s16 = vcombine_s16(d22s16, d23s16);
// Odd-half add/sub stage; keep the differences split into halves for the
// cospi_16_64 rotation below.
152 q13s16 = vsubq_s16(q4s16, q5s16);
153 q4s16 = vaddq_s16(q4s16, q5s16);
154 q14s16 = vsubq_s16(q7s16, q6s16);
155 q15s16 = vaddq_s16(q6s16, q7s16);
156 d26s16 = vget_low_s16(q13s16);
157 d27s16 = vget_high_s16(q13s16);
158 d28s16 = vget_low_s16(q14s16);
159 d29s16 = vget_high_s16(q14s16);
// Even-half add/sub stage.
162 q0s16 = vaddq_s16(q8s16, q11s16);
163 q1s16 = vaddq_s16(q9s16, q10s16);
164 q2s16 = vsubq_s16(q9s16, q10s16);
165 q3s16 = vsubq_s16(q8s16, q11s16);
// Rotate the two odd-half differences by cospi_16_64 -> q5, q6.
167 d16s16 = vdup_n_s16((int16_t)cospi_16_64);
169 q11s32 = vmull_s16(d26s16, d16s16);
170 q12s32 = vmull_s16(d27s16, d16s16);
171 q9s32 = vmull_s16(d28s16, d16s16);
172 q10s32 = vmull_s16(d29s16, d16s16);
174 q6s32 = vsubq_s32(q9s32, q11s32);
175 q13s32 = vsubq_s32(q10s32, q12s32);
176 q9s32 = vaddq_s32(q9s32, q11s32);
177 q10s32 = vaddq_s32(q10s32, q12s32);
179 d10s16 = vrshrn_n_s32(q6s32, 14);
180 d11s16 = vrshrn_n_s32(q13s32, 14);
181 d12s16 = vrshrn_n_s32(q9s32, 14);
182 d13s16 = vrshrn_n_s32(q10s32, 14);
183 q5s16 = vcombine_s16(d10s16, d11s16);
184 q6s16 = vcombine_s16(d12s16, d13s16);
// Final butterfly: combine even and odd halves into the eight outputs.
187 q8s16 = vaddq_s16(q0s16, q15s16);
188 q9s16 = vaddq_s16(q1s16, q6s16);
189 q10s16 = vaddq_s16(q2s16, q5s16);
190 q11s16 = vaddq_s16(q3s16, q4s16);
191 q12s16 = vsubq_s16(q3s16, q4s16);
192 q13s16 = vsubq_s16(q2s16, q5s16);
193 q14s16 = vsubq_s16(q1s16, q6s16);
194 q15s16 = vsubq_s16(q0s16, q15s16);
// Store the eight result rows (out is presumably advanced between stores;
// the increments are not visible in this extraction -- TODO confirm).
197 vst1q_s16(out, q8s16);
199 vst1q_s16(out, q9s16);
201 vst1q_s16(out, q10s16);
203 vst1q_s16(out, q11s16);
205 vst1q_s16(out, q12s16);
207 vst1q_s16(out, q13s16);
209 vst1q_s16(out, q14s16);
211 vst1q_s16(out, q15s16);
// Pass 2 of the full 16x16 inverse DCT: transforms the odd half of the
// coefficients, recombines with the pass-1 results (|pass1_output|), and
// either (skip_adding != 0) rounds by 6, adds to the |dest| pixels with
// unsigned saturation and writes the reconstructed bytes, or
// (skip_adding == 0) stores the raw int16 results to |out|.
// NOTE(review): the visible signature is truncated (a trailing parameter,
// presumably the dest stride, is missing), and the locals q0x2s16 and the
// byte pointer "d" used by the vst1_u64((uint64_t *)d, ...) stores are not
// declared in this extraction; pointer-advance statements between loads
// and stores also appear stripped -- confirm against the upstream file.
214 void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *out,
215 int16_t *pass1_output,
216 int16_t skip_adding, uint8_t *dest,
219 uint8x8_t d12u8, d13u8;
220 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
221 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
222 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
223 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
224 uint64x1_t d24u64, d25u64, d26u64, d27u64;
225 int64x1_t d12s64, d13s64;
226 uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
227 uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
228 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
229 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
230 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
231 int32x4_t q10s32, q11s32, q12s32, q13s32;
// Load the odd-half coefficients (vld2q_s16 de-interleaves; val[0] keeps
// the even lanes of each source line).
234 q0x2s16 = vld2q_s16(src);
235 q8s16 = q0x2s16.val[0];
237 q0x2s16 = vld2q_s16(src);
238 q9s16 = q0x2s16.val[0];
240 q0x2s16 = vld2q_s16(src);
241 q10s16 = q0x2s16.val[0];
243 q0x2s16 = vld2q_s16(src);
244 q11s16 = q0x2s16.val[0];
246 q0x2s16 = vld2q_s16(src);
247 q12s16 = q0x2s16.val[0];
249 q0x2s16 = vld2q_s16(src);
250 q13s16 = q0x2s16.val[0];
252 q0x2s16 = vld2q_s16(src);
253 q14s16 = q0x2s16.val[0];
255 q0x2s16 = vld2q_s16(src);
256 q15s16 = q0x2s16.val[0];
258 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Halve the registers for the widening multiplies below.
261 d16s16 = vget_low_s16(q8s16);
262 d17s16 = vget_high_s16(q8s16);
263 d18s16 = vget_low_s16(q9s16);
264 d19s16 = vget_high_s16(q9s16);
265 d20s16 = vget_low_s16(q10s16);
266 d21s16 = vget_high_s16(q10s16);
267 d22s16 = vget_low_s16(q11s16);
268 d23s16 = vget_high_s16(q11s16);
269 d24s16 = vget_low_s16(q12s16);
270 d25s16 = vget_high_s16(q12s16);
271 d26s16 = vget_low_s16(q13s16);
272 d27s16 = vget_high_s16(q13s16);
273 d28s16 = vget_low_s16(q14s16);
274 d29s16 = vget_high_s16(q14s16);
275 d30s16 = vget_low_s16(q15s16);
276 d31s16 = vget_high_s16(q15s16);
// Butterfly with cospi_30_64 / cospi_2_64 -> q0, q7 (Q14 rounding via
// vrshrn_n_s32(x, 14) throughout).
279 d12s16 = vdup_n_s16((int16_t)cospi_30_64);
280 d13s16 = vdup_n_s16((int16_t)cospi_2_64);
282 q2s32 = vmull_s16(d16s16, d12s16);
283 q3s32 = vmull_s16(d17s16, d12s16);
284 q1s32 = vmull_s16(d16s16, d13s16);
285 q4s32 = vmull_s16(d17s16, d13s16);
287 q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
288 q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
289 q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
290 q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
292 d0s16 = vrshrn_n_s32(q2s32, 14);
293 d1s16 = vrshrn_n_s32(q3s32, 14);
294 d14s16 = vrshrn_n_s32(q1s32, 14);
295 d15s16 = vrshrn_n_s32(q4s32, 14);
296 q0s16 = vcombine_s16(d0s16, d1s16);
297 q7s16 = vcombine_s16(d14s16, d15s16);
// Butterfly with cospi_14_64 / cospi_18_64 -> q1, q6.
299 d30s16 = vdup_n_s16((int16_t)cospi_14_64);
300 d31s16 = vdup_n_s16((int16_t)cospi_18_64);
302 q2s32 = vmull_s16(d24s16, d30s16);
303 q3s32 = vmull_s16(d25s16, d30s16);
304 q4s32 = vmull_s16(d24s16, d31s16);
305 q5s32 = vmull_s16(d25s16, d31s16);
307 q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
308 q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
309 q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
310 q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
312 d2s16 = vrshrn_n_s32(q2s32, 14);
313 d3s16 = vrshrn_n_s32(q3s32, 14);
314 d12s16 = vrshrn_n_s32(q4s32, 14);
315 d13s16 = vrshrn_n_s32(q5s32, 14);
316 q1s16 = vcombine_s16(d2s16, d3s16);
317 q6s16 = vcombine_s16(d12s16, d13s16);
// Butterfly with cospi_22_64 / cospi_10_64 -> q2, q5.
319 d30s16 = vdup_n_s16((int16_t)cospi_22_64);
320 d31s16 = vdup_n_s16((int16_t)cospi_10_64);
322 q11s32 = vmull_s16(d20s16, d30s16);
323 q12s32 = vmull_s16(d21s16, d30s16);
324 q4s32 = vmull_s16(d20s16, d31s16);
325 q5s32 = vmull_s16(d21s16, d31s16);
327 q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
328 q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
329 q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
330 q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
332 d4s16 = vrshrn_n_s32(q11s32, 14);
333 d5s16 = vrshrn_n_s32(q12s32, 14);
334 d11s16 = vrshrn_n_s32(q5s32, 14);
335 d10s16 = vrshrn_n_s32(q4s32, 14);
336 q2s16 = vcombine_s16(d4s16, d5s16);
337 q5s16 = vcombine_s16(d10s16, d11s16);
// Butterfly with cospi_6_64 / cospi_26_64 -> q3, q4.
339 d30s16 = vdup_n_s16((int16_t)cospi_6_64);
340 d31s16 = vdup_n_s16((int16_t)cospi_26_64);
342 q10s32 = vmull_s16(d28s16, d30s16);
343 q11s32 = vmull_s16(d29s16, d30s16);
344 q12s32 = vmull_s16(d28s16, d31s16);
345 q13s32 = vmull_s16(d29s16, d31s16);
347 q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
348 q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
349 q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
350 q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
352 d6s16 = vrshrn_n_s32(q10s32, 14);
353 d7s16 = vrshrn_n_s32(q11s32, 14);
354 d8s16 = vrshrn_n_s32(q12s32, 14);
355 d9s16 = vrshrn_n_s32(q13s32, 14);
356 q3s16 = vcombine_s16(d6s16, d7s16);
357 q4s16 = vcombine_s16(d8s16, d9s16);
// First add/sub stage of the odd half.
360 q9s16 = vsubq_s16(q0s16, q1s16);
361 q0s16 = vaddq_s16(q0s16, q1s16);
362 q10s16 = vsubq_s16(q3s16, q2s16);
363 q11s16 = vaddq_s16(q2s16, q3s16);
364 q12s16 = vaddq_s16(q4s16, q5s16);
365 q13s16 = vsubq_s16(q4s16, q5s16);
366 q14s16 = vsubq_s16(q7s16, q6s16);
367 q7s16 = vaddq_s16(q6s16, q7s16);
// Halves of the differences for the +/-cospi_8_64/cospi_24_64 rotations.
370 d18s16 = vget_low_s16(q9s16);
371 d19s16 = vget_high_s16(q9s16);
372 d20s16 = vget_low_s16(q10s16);
373 d21s16 = vget_high_s16(q10s16);
374 d26s16 = vget_low_s16(q13s16);
375 d27s16 = vget_high_s16(q13s16);
376 d28s16 = vget_low_s16(q14s16);
377 d29s16 = vget_high_s16(q14s16);
379 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
380 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
// Rotation of q9/q14 by (cospi_8_64, cospi_24_64) -> q1, q6.
382 q2s32 = vmull_s16(d18s16, d31s16);
383 q3s32 = vmull_s16(d19s16, d31s16);
384 q4s32 = vmull_s16(d28s16, d31s16);
385 q5s32 = vmull_s16(d29s16, d31s16);
387 q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
388 q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
389 q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
390 q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
392 d12s16 = vrshrn_n_s32(q2s32, 14);
393 d13s16 = vrshrn_n_s32(q3s32, 14);
394 d2s16 = vrshrn_n_s32(q4s32, 14);
395 d3s16 = vrshrn_n_s32(q5s32, 14);
396 q1s16 = vcombine_s16(d2s16, d3s16);
397 q6s16 = vcombine_s16(d12s16, d13s16);
// Rotation of q10/q13 using -cospi_8_64 -> q2, q5.
402 d30s16 = vdup_n_s16(-cospi_8_64);
403 q11s32 = vmull_s16(d26s16, d30s16);
404 q12s32 = vmull_s16(d27s16, d30s16);
405 q8s32 = vmull_s16(d20s16, d30s16);
406 q9s32 = vmull_s16(d21s16, d30s16);
408 q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
409 q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
410 q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
411 q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
413 d4s16 = vrshrn_n_s32(q11s32, 14);
414 d5s16 = vrshrn_n_s32(q12s32, 14);
415 d10s16 = vrshrn_n_s32(q8s32, 14);
416 d11s16 = vrshrn_n_s32(q9s32, 14);
417 q2s16 = vcombine_s16(d4s16, d5s16);
418 q5s16 = vcombine_s16(d10s16, d11s16);
// Second add/sub stage.
421 q8s16 = vaddq_s16(q0s16, q3s16);
422 q9s16 = vaddq_s16(q1s16, q2s16);
423 q10s16 = vsubq_s16(q1s16, q2s16);
424 q11s16 = vsubq_s16(q0s16, q3s16);
425 q12s16 = vsubq_s16(q7s16, q4s16);
426 q13s16 = vsubq_s16(q6s16, q5s16);
427 q14s16 = vaddq_s16(q6s16, q5s16);
428 q15s16 = vaddq_s16(q7s16, q4s16);
431 d20s16 = vget_low_s16(q10s16);
432 d21s16 = vget_high_s16(q10s16);
433 d22s16 = vget_low_s16(q11s16);
434 d23s16 = vget_high_s16(q11s16);
435 d24s16 = vget_low_s16(q12s16);
436 d25s16 = vget_high_s16(q12s16);
437 d26s16 = vget_low_s16(q13s16);
438 d27s16 = vget_high_s16(q13s16);
// cospi_16_64 rotations of the four middle terms -> q2/q5 and q3/q4.
440 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
442 q3s32 = vmull_s16(d26s16, d14s16);
443 q4s32 = vmull_s16(d27s16, d14s16);
444 q0s32 = vmull_s16(d20s16, d14s16);
445 q1s32 = vmull_s16(d21s16, d14s16);
447 q5s32 = vsubq_s32(q3s32, q0s32);
448 q6s32 = vsubq_s32(q4s32, q1s32);
449 q10s32 = vaddq_s32(q3s32, q0s32);
450 q4s32 = vaddq_s32(q4s32, q1s32);
452 d4s16 = vrshrn_n_s32(q5s32, 14);
453 d5s16 = vrshrn_n_s32(q6s32, 14);
454 d10s16 = vrshrn_n_s32(q10s32, 14);
455 d11s16 = vrshrn_n_s32(q4s32, 14);
456 q2s16 = vcombine_s16(d4s16, d5s16);
457 q5s16 = vcombine_s16(d10s16, d11s16);
459 q0s32 = vmull_s16(d22s16, d14s16);
460 q1s32 = vmull_s16(d23s16, d14s16);
461 q13s32 = vmull_s16(d24s16, d14s16);
462 q6s32 = vmull_s16(d25s16, d14s16);
464 q10s32 = vsubq_s32(q13s32, q0s32);
465 q4s32 = vsubq_s32(q6s32, q1s32);
466 q13s32 = vaddq_s32(q13s32, q0s32);
467 q6s32 = vaddq_s32(q6s32, q1s32);
469 d6s16 = vrshrn_n_s32(q10s32, 14);
470 d7s16 = vrshrn_n_s32(q4s32, 14);
471 d8s16 = vrshrn_n_s32(q13s32, 14);
472 d9s16 = vrshrn_n_s32(q6s32, 14);
473 q3s16 = vcombine_s16(d6s16, d7s16);
474 q4s16 = vcombine_s16(d8s16, d9s16);
// Reconstruction path: combine pass-1 rows with this pass, round by 6,
// add to the destination pixels with unsigned-saturating narrow, store.
477 if (skip_adding != 0) {
479 // load the data in pass1
480 q0s16 = vld1q_s16(pass1_output);
482 q1s16 = vld1q_s16(pass1_output);
484 d12s64 = vld1_s64((int64_t *)dest);
486 d13s64 = vld1_s64((int64_t *)dest);
// Rows 0-1: pass1 + q15/q14.
489 q12s16 = vaddq_s16(q0s16, q15s16);
490 q13s16 = vaddq_s16(q1s16, q14s16);
491 q12s16 = vrshrq_n_s16(q12s16, 6);
492 q13s16 = vrshrq_n_s16(q13s16, 6);
// NOTE(review): the assignment targets (q12u16 = / q13u16 =) for these
// vaddw_u8 results appear stripped in this extraction.
494 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
496 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
497 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
498 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
499 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
501 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
// Keep the differences for the mirrored rows 14-15.
503 q14s16 = vsubq_s16(q1s16, q14s16);
504 q15s16 = vsubq_s16(q0s16, q15s16);
// Rows 2-3: pass1 + q5/q4.
506 q10s16 = vld1q_s16(pass1_output);
508 q11s16 = vld1q_s16(pass1_output);
510 d12s64 = vld1_s64((int64_t *)dest);
512 d13s64 = vld1_s64((int64_t *)dest);
514 q12s16 = vaddq_s16(q10s16, q5s16);
515 q13s16 = vaddq_s16(q11s16, q4s16);
516 q12s16 = vrshrq_n_s16(q12s16, 6);
517 q13s16 = vrshrq_n_s16(q13s16, 6);
519 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
521 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
522 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
523 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
524 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
526 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
528 q4s16 = vsubq_s16(q11s16, q4s16);
529 q5s16 = vsubq_s16(q10s16, q5s16);
// Rows 4-5: pass1 + q3/q2.
531 q0s16 = vld1q_s16(pass1_output);
533 q1s16 = vld1q_s16(pass1_output);
535 d12s64 = vld1_s64((int64_t *)dest);
537 d13s64 = vld1_s64((int64_t *)dest);
539 q12s16 = vaddq_s16(q0s16, q3s16);
540 q13s16 = vaddq_s16(q1s16, q2s16);
541 q12s16 = vrshrq_n_s16(q12s16, 6);
542 q13s16 = vrshrq_n_s16(q13s16, 6);
544 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
546 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
547 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
548 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
549 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
551 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
553 q2s16 = vsubq_s16(q1s16, q2s16);
554 q3s16 = vsubq_s16(q0s16, q3s16);
// Rows 6-7: pass1 + q9/q8.
556 q10s16 = vld1q_s16(pass1_output);
558 q11s16 = vld1q_s16(pass1_output);
559 d12s64 = vld1_s64((int64_t *)dest);
561 d13s64 = vld1_s64((int64_t *)dest);
563 q12s16 = vaddq_s16(q10s16, q9s16);
564 q13s16 = vaddq_s16(q11s16, q8s16);
565 q12s16 = vrshrq_n_s16(q12s16, 6);
566 q13s16 = vrshrq_n_s16(q13s16, 6);
568 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
570 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
571 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
572 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
573 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
575 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
577 q8s16 = vsubq_s16(q11s16, q8s16);
578 q9s16 = vsubq_s16(q10s16, q9s16);
580 // store the data out 8,9,10,11,12,13,14,15
581 d12s64 = vld1_s64((int64_t *)dest);
583 q8s16 = vrshrq_n_s16(q8s16, 6);
584 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
585 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
586 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
589 d12s64 = vld1_s64((int64_t *)dest);
591 q9s16 = vrshrq_n_s16(q9s16, 6);
592 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
593 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
594 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
597 d12s64 = vld1_s64((int64_t *)dest);
599 q2s16 = vrshrq_n_s16(q2s16, 6);
600 q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
601 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
602 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
605 d12s64 = vld1_s64((int64_t *)dest);
607 q3s16 = vrshrq_n_s16(q3s16, 6);
608 q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
609 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
610 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
613 d12s64 = vld1_s64((int64_t *)dest);
615 q4s16 = vrshrq_n_s16(q4s16, 6);
616 q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
617 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
618 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
621 d12s64 = vld1_s64((int64_t *)dest);
623 q5s16 = vrshrq_n_s16(q5s16, 6);
624 q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
625 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
626 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
629 d12s64 = vld1_s64((int64_t *)dest);
631 q14s16 = vrshrq_n_s16(q14s16, 6);
633 vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
634 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
635 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
638 d12s64 = vld1_s64((int64_t *)dest);
639 q15s16 = vrshrq_n_s16(q15s16, 6);
641 vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
642 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
643 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
// Non-adding path: write the raw int16 results to |out| (64 bits at a
// time via uint64x1_t reinterprets).
644 } else { // skip_adding_dest
645 q0s16 = vld1q_s16(pass1_output);
647 q1s16 = vld1q_s16(pass1_output);
649 q12s16 = vaddq_s16(q0s16, q15s16);
650 q13s16 = vaddq_s16(q1s16, q14s16);
651 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
652 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
653 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
654 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
655 vst1_u64((uint64_t *)out, d24u64);
657 vst1_u64((uint64_t *)out, d25u64);
659 vst1_u64((uint64_t *)out, d26u64);
661 vst1_u64((uint64_t *)out, d27u64);
663 q14s16 = vsubq_s16(q1s16, q14s16);
664 q15s16 = vsubq_s16(q0s16, q15s16);
666 q10s16 = vld1q_s16(pass1_output);
668 q11s16 = vld1q_s16(pass1_output);
670 q12s16 = vaddq_s16(q10s16, q5s16);
671 q13s16 = vaddq_s16(q11s16, q4s16);
672 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
673 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
674 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
675 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
676 vst1_u64((uint64_t *)out, d24u64);
678 vst1_u64((uint64_t *)out, d25u64);
680 vst1_u64((uint64_t *)out, d26u64);
682 vst1_u64((uint64_t *)out, d27u64);
684 q4s16 = vsubq_s16(q11s16, q4s16);
685 q5s16 = vsubq_s16(q10s16, q5s16);
687 q0s16 = vld1q_s16(pass1_output);
689 q1s16 = vld1q_s16(pass1_output);
691 q12s16 = vaddq_s16(q0s16, q3s16);
692 q13s16 = vaddq_s16(q1s16, q2s16);
693 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
694 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
695 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
696 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
697 vst1_u64((uint64_t *)out, d24u64);
699 vst1_u64((uint64_t *)out, d25u64);
701 vst1_u64((uint64_t *)out, d26u64);
703 vst1_u64((uint64_t *)out, d27u64);
705 q2s16 = vsubq_s16(q1s16, q2s16);
706 q3s16 = vsubq_s16(q0s16, q3s16);
708 q10s16 = vld1q_s16(pass1_output);
710 q11s16 = vld1q_s16(pass1_output);
712 q12s16 = vaddq_s16(q10s16, q9s16);
713 q13s16 = vaddq_s16(q11s16, q8s16);
714 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
715 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
716 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
717 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
718 vst1_u64((uint64_t *)out, d24u64);
720 vst1_u64((uint64_t *)out, d25u64);
722 vst1_u64((uint64_t *)out, d26u64);
724 vst1_u64((uint64_t *)out, d27u64);
726 q8s16 = vsubq_s16(q11s16, q8s16);
727 q9s16 = vsubq_s16(q10s16, q9s16);
// Remaining rows: pure stores of the difference terms.
729 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
731 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
733 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
735 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
737 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
739 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
741 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
743 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
745 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
747 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
749 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
751 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
753 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
755 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
757 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
759 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
// Pass 1 of the reduced 16x16 inverse DCT for at most 10 nonzero
// coefficients: because only the low-frequency inputs matter, most
// butterflies collapse to single vqrdmulhq_s16 scalings (the constants are
// pre-doubled to compensate for vqrdmulh's implicit >>15 versus the Q14
// cospi scale).  Writes eight transformed rows to |out|.
// NOTE(review): the q0x2s16 declaration and the pointer-advance statements
// between loads/stores appear stripped in this extraction -- confirm
// against the upstream file.
763 void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *in, int16_t *out) {
765 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
766 int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
767 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
768 int32x4_t q6s32, q9s32;
769 int32x4_t q10s32, q11s32, q12s32, q15s32;
// Load tran_low_t input, de-interleaved down to the even int16 lanes.
772 q0x2s16 = load_tran_low_to_s16x2q(in);
773 q8s16 = q0x2s16.val[0];
775 q0x2s16 = load_tran_low_to_s16x2q(in);
776 q9s16 = q0x2s16.val[0];
778 q0x2s16 = load_tran_low_to_s16x2q(in);
779 q10s16 = q0x2s16.val[0];
781 q0x2s16 = load_tran_low_to_s16x2q(in);
782 q11s16 = q0x2s16.val[0];
784 q0x2s16 = load_tran_low_to_s16x2q(in);
785 q12s16 = q0x2s16.val[0];
787 q0x2s16 = load_tran_low_to_s16x2q(in);
788 q13s16 = q0x2s16.val[0];
790 q0x2s16 = load_tran_low_to_s16x2q(in);
791 q14s16 = q0x2s16.val[0];
793 q0x2s16 = load_tran_low_to_s16x2q(in);
794 q15s16 = q0x2s16.val[0];
796 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Row 1 scaled by doubled cospi_28_64 / cospi_4_64 -> q4, q7.
800 q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
801 q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
803 q4s16 = vqrdmulhq_s16(q9s16, q0s16);
804 q7s16 = vqrdmulhq_s16(q9s16, q1s16);
// DC term (row 0) scaled by doubled cospi_16_64.
807 q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
808 d4s16 = vdup_n_s16((int16_t)cospi_16_64);
810 q8s16 = vqrdmulhq_s16(q8s16, q1s16);
// cospi_16_64 rotation of q4/q7 -> q5, q6 (Q14, rounded with
// vrshrn_n_s32(x, 14)).
812 d8s16 = vget_low_s16(q4s16);
813 d9s16 = vget_high_s16(q4s16);
814 d14s16 = vget_low_s16(q7s16);
815 d15s16 = vget_high_s16(q7s16);
816 q9s32 = vmull_s16(d14s16, d4s16);
817 q10s32 = vmull_s16(d15s16, d4s16);
818 q12s32 = vmull_s16(d9s16, d4s16);
819 q11s32 = vmull_s16(d8s16, d4s16);
821 q15s32 = vsubq_s32(q10s32, q12s32);
822 q6s32 = vsubq_s32(q9s32, q11s32);
823 q9s32 = vaddq_s32(q9s32, q11s32);
824 q10s32 = vaddq_s32(q10s32, q12s32);
826 d11s16 = vrshrn_n_s32(q15s32, 14);
827 d10s16 = vrshrn_n_s32(q6s32, 14);
828 d12s16 = vrshrn_n_s32(q9s32, 14);
829 d13s16 = vrshrn_n_s32(q10s32, 14);
830 q5s16 = vcombine_s16(d10s16, d11s16);
831 q6s16 = vcombine_s16(d12s16, d13s16);
// Final butterfly around the DC term q8.
834 q2s16 = vaddq_s16(q8s16, q7s16);
835 q9s16 = vaddq_s16(q8s16, q6s16);
836 q10s16 = vaddq_s16(q8s16, q5s16);
837 q11s16 = vaddq_s16(q8s16, q4s16);
838 q12s16 = vsubq_s16(q8s16, q4s16);
839 q13s16 = vsubq_s16(q8s16, q5s16);
840 q14s16 = vsubq_s16(q8s16, q6s16);
841 q15s16 = vsubq_s16(q8s16, q7s16);
// Store the eight result rows.
844 vst1q_s16(out, q2s16);
846 vst1q_s16(out, q9s16);
848 vst1q_s16(out, q10s16);
850 vst1q_s16(out, q11s16);
852 vst1q_s16(out, q12s16);
854 vst1q_s16(out, q13s16);
856 vst1q_s16(out, q14s16);
858 vst1q_s16(out, q15s16);
// Pass 2 of the reduced (<=10 nonzero coefficients) 16x16 inverse DCT:
// transforms the odd half (only the first two coefficient rows matter, via
// vqrdmulhq_s16 with pre-doubled constants), combines with the pass-1
// rows from |pass1_output|, and stores the raw int16 results to |out|.
// Unlike the full-path pass2 there is no destination-adding branch here.
// NOTE(review): the q0x2s16 declaration and the pointer-advance statements
// between loads/stores appear stripped in this extraction -- confirm
// against the upstream file.
861 void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *out,
862 int16_t *pass1_output) {
863 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
864 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
865 int16x4_t d20s16, d21s16, d22s16, d23s16;
866 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
867 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
868 uint64x1_t d16u64, d17u64, d18u64, d19u64;
869 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
870 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
871 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
872 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
873 int32x4_t q10s32, q11s32, q12s32, q13s32;
// Load the odd-half tran_low_t coefficients.
876 q0x2s16 = load_tran_low_to_s16x2q(src);
877 q8s16 = q0x2s16.val[0];
879 q0x2s16 = load_tran_low_to_s16x2q(src);
880 q9s16 = q0x2s16.val[0];
882 q0x2s16 = load_tran_low_to_s16x2q(src);
883 q10s16 = q0x2s16.val[0];
885 q0x2s16 = load_tran_low_to_s16x2q(src);
886 q11s16 = q0x2s16.val[0];
888 q0x2s16 = load_tran_low_to_s16x2q(src);
889 q12s16 = q0x2s16.val[0];
891 q0x2s16 = load_tran_low_to_s16x2q(src);
892 q13s16 = q0x2s16.val[0];
894 q0x2s16 = load_tran_low_to_s16x2q(src);
895 q14s16 = q0x2s16.val[0];
897 q0x2s16 = load_tran_low_to_s16x2q(src);
898 q15s16 = q0x2s16.val[0];
900 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Only two input rows contribute: scale row 0 by doubled cospi_30_64 /
// cospi_2_64 and row 1 by doubled -cospi_26_64 / cospi_6_64.
904 q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
905 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
906 q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
907 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
909 q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
910 q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
911 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
912 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
915 d0s16 = vget_low_s16(q0s16);
916 d1s16 = vget_high_s16(q0s16);
917 d6s16 = vget_low_s16(q3s16);
918 d7s16 = vget_high_s16(q3s16);
919 d8s16 = vget_low_s16(q4s16);
920 d9s16 = vget_high_s16(q4s16);
921 d14s16 = vget_low_s16(q7s16);
922 d15s16 = vget_high_s16(q7s16);
// (cospi_8_64, cospi_24_64) rotation of q0/q7 -> q1, q6.  All Q14
// products round back to int16 via vrshrn_n_s32(x, 14).
924 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
925 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
927 q12s32 = vmull_s16(d14s16, d31s16);
928 q5s32 = vmull_s16(d15s16, d31s16);
929 q2s32 = vmull_s16(d0s16, d31s16);
930 q11s32 = vmull_s16(d1s16, d31s16);
932 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
933 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
934 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
935 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
937 d2s16 = vrshrn_n_s32(q12s32, 14);
938 d3s16 = vrshrn_n_s32(q5s32, 14);
939 d12s16 = vrshrn_n_s32(q2s32, 14);
940 d13s16 = vrshrn_n_s32(q11s32, 14);
941 q1s16 = vcombine_s16(d2s16, d3s16);
942 q6s16 = vcombine_s16(d12s16, d13s16);
// Rotation of q3/q4 using -cospi_8_64 -> q2, q5.
944 d30s16 = vdup_n_s16(-cospi_8_64);
945 q10s32 = vmull_s16(d8s16, d30s16);
946 q13s32 = vmull_s16(d9s16, d30s16);
947 q8s32 = vmull_s16(d6s16, d30s16);
948 q9s32 = vmull_s16(d7s16, d30s16);
950 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
951 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
952 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
953 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
955 d4s16 = vrshrn_n_s32(q10s32, 14);
956 d5s16 = vrshrn_n_s32(q13s32, 14);
957 d10s16 = vrshrn_n_s32(q8s32, 14);
958 d11s16 = vrshrn_n_s32(q9s32, 14);
959 q2s16 = vcombine_s16(d4s16, d5s16);
960 q5s16 = vcombine_s16(d10s16, d11s16);
// Add/sub stage of the odd half.
963 q8s16 = vaddq_s16(q0s16, q3s16);
964 q9s16 = vaddq_s16(q1s16, q2s16);
965 q10s16 = vsubq_s16(q1s16, q2s16);
966 q11s16 = vsubq_s16(q0s16, q3s16);
967 q12s16 = vsubq_s16(q7s16, q4s16);
968 q13s16 = vsubq_s16(q6s16, q5s16);
969 q14s16 = vaddq_s16(q6s16, q5s16);
970 q15s16 = vaddq_s16(q7s16, q4s16);
973 d20s16 = vget_low_s16(q10s16);
974 d21s16 = vget_high_s16(q10s16);
975 d22s16 = vget_low_s16(q11s16);
976 d23s16 = vget_high_s16(q11s16);
977 d24s16 = vget_low_s16(q12s16);
978 d25s16 = vget_high_s16(q12s16);
979 d26s16 = vget_low_s16(q13s16);
980 d27s16 = vget_high_s16(q13s16);
// cospi_16_64 rotations of the middle terms -> q2/q5 and q3/q4.
982 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
983 q3s32 = vmull_s16(d26s16, d14s16);
984 q4s32 = vmull_s16(d27s16, d14s16);
985 q0s32 = vmull_s16(d20s16, d14s16);
986 q1s32 = vmull_s16(d21s16, d14s16);
988 q5s32 = vsubq_s32(q3s32, q0s32);
989 q6s32 = vsubq_s32(q4s32, q1s32);
990 q0s32 = vaddq_s32(q3s32, q0s32);
991 q4s32 = vaddq_s32(q4s32, q1s32);
993 d4s16 = vrshrn_n_s32(q5s32, 14);
994 d5s16 = vrshrn_n_s32(q6s32, 14);
995 d10s16 = vrshrn_n_s32(q0s32, 14);
996 d11s16 = vrshrn_n_s32(q4s32, 14);
997 q2s16 = vcombine_s16(d4s16, d5s16);
998 q5s16 = vcombine_s16(d10s16, d11s16);
1000 q0s32 = vmull_s16(d22s16, d14s16);
1001 q1s32 = vmull_s16(d23s16, d14s16);
1002 q13s32 = vmull_s16(d24s16, d14s16);
1003 q6s32 = vmull_s16(d25s16, d14s16);
1005 q10s32 = vsubq_s32(q13s32, q0s32);
1006 q4s32 = vsubq_s32(q6s32, q1s32);
1007 q13s32 = vaddq_s32(q13s32, q0s32);
1008 q6s32 = vaddq_s32(q6s32, q1s32);
1010 d6s16 = vrshrn_n_s32(q10s32, 14);
1011 d7s16 = vrshrn_n_s32(q4s32, 14);
1012 d8s16 = vrshrn_n_s32(q13s32, 14);
1013 d9s16 = vrshrn_n_s32(q6s32, 14);
1014 q3s16 = vcombine_s16(d6s16, d7s16);
1015 q4s16 = vcombine_s16(d8s16, d9s16);
// Combine with the pass-1 rows and store int16 results (the raw-store
// path; pointer advances between loads/stores are not visible here).
1018 q0s16 = vld1q_s16(pass1_output);
1020 q1s16 = vld1q_s16(pass1_output);
1022 q12s16 = vaddq_s16(q0s16, q15s16);
1023 q13s16 = vaddq_s16(q1s16, q14s16);
1024 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1025 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1026 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1027 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1028 vst1_u64((uint64_t *)out, d24u64);
1030 vst1_u64((uint64_t *)out, d25u64);
1032 vst1_u64((uint64_t *)out, d26u64);
1034 vst1_u64((uint64_t *)out, d27u64);
1036 q14s16 = vsubq_s16(q1s16, q14s16);
1037 q15s16 = vsubq_s16(q0s16, q15s16);
1039 q10s16 = vld1q_s16(pass1_output);
1041 q11s16 = vld1q_s16(pass1_output);
1043 q12s16 = vaddq_s16(q10s16, q5s16);
1044 q13s16 = vaddq_s16(q11s16, q4s16);
1045 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1046 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1047 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1048 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1049 vst1_u64((uint64_t *)out, d24u64);
1051 vst1_u64((uint64_t *)out, d25u64);
1053 vst1_u64((uint64_t *)out, d26u64);
1055 vst1_u64((uint64_t *)out, d27u64);
1057 q4s16 = vsubq_s16(q11s16, q4s16);
1058 q5s16 = vsubq_s16(q10s16, q5s16);
1060 q0s16 = vld1q_s16(pass1_output);
1062 q1s16 = vld1q_s16(pass1_output);
1064 q12s16 = vaddq_s16(q0s16, q3s16);
1065 q13s16 = vaddq_s16(q1s16, q2s16);
1066 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1067 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1068 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1069 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1070 vst1_u64((uint64_t *)out, d24u64);
1072 vst1_u64((uint64_t *)out, d25u64);
1074 vst1_u64((uint64_t *)out, d26u64);
1076 vst1_u64((uint64_t *)out, d27u64);
1078 q2s16 = vsubq_s16(q1s16, q2s16);
1079 q3s16 = vsubq_s16(q0s16, q3s16);
1081 q10s16 = vld1q_s16(pass1_output);
1083 q11s16 = vld1q_s16(pass1_output);
1084 q12s16 = vaddq_s16(q10s16, q9s16);
1085 q13s16 = vaddq_s16(q11s16, q8s16);
1086 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1087 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1088 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1089 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1090 vst1_u64((uint64_t *)out, d24u64);
1092 vst1_u64((uint64_t *)out, d25u64);
1094 vst1_u64((uint64_t *)out, d26u64);
1096 vst1_u64((uint64_t *)out, d27u64);
1098 q8s16 = vsubq_s16(q11s16, q8s16);
1099 q9s16 = vsubq_s16(q10s16, q9s16);
// Reinterpret the remaining difference rows as 64-bit lanes and store.
1101 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1102 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1103 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1104 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1105 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1106 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1107 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1108 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1109 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1110 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1111 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1112 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1113 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1114 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1115 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1116 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1118 vst1_u64((uint64_t *)out, d16u64);
1120 vst1_u64((uint64_t *)out, d17u64);
1122 vst1_u64((uint64_t *)out, d18u64);
1124 vst1_u64((uint64_t *)out, d19u64);
1126 vst1_u64((uint64_t *)out, d4u64);
1128 vst1_u64((uint64_t *)out, d5u64);
1130 vst1_u64((uint64_t *)out, d6u64);
1132 vst1_u64((uint64_t *)out, d7u64);
1134 vst1_u64((uint64_t *)out, d8u64);
1136 vst1_u64((uint64_t *)out, d9u64);
1138 vst1_u64((uint64_t *)out, d10u64);
1140 vst1_u64((uint64_t *)out, d11u64);
1142 vst1_u64((uint64_t *)out, d28u64);
1144 vst1_u64((uint64_t *)out, d29u64);
1146 vst1_u64((uint64_t *)out, d30u64);
1148 vst1_u64((uint64_t *)out, d31u64);