/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_config.h"
14 #include "vpx_dsp/arm/transpose_neon.h"
15 #include "vpx_dsp/txfm_common.h"
// Pass 1 of the NEON 16x16 inverse DCT (full 256-coefficient path).
// Loads 8 rows of coefficients (vld2q de-interleaves; val[0] keeps the
// even-indexed lanes), transposes the 8x8 tile, runs the idct butterfly
// stages, and stores sixteen 8-byte result halves to `out`.
// NOTE(review): this copy shows extraction artifacts — every statement is
// prefixed with its original source line number, and several lines are
// missing (the `int output_stride)` signature continuation, the
// `int16x8x2_t q0x2s16;` declaration, the `in += 8 * 2;` pointer advances
// between loads, the `&q15s16);` tail of the transpose call, and the
// closing brace). Restore from upstream libvpx before building.
17 void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
// 64-bit halves (d registers) and 128-bit vectors (q registers) used by
// the butterfly network; names mirror the ARM register they map to.
19 int16x4_t d0s16, d1s16, d2s16, d3s16;
20 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
21 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
22 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
23 uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
24 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
25 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
26 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
27 int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
28 int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
// Load 8 rows into q8..q15. vld2q_s16 de-interleaves pairs, so val[0]
// picks up the even-indexed int16 lanes of each 16-element row.
// (The `in += 8 * 2;` advances between loads are missing in this copy.)
31 q0x2s16 = vld2q_s16(in);
32 q8s16 = q0x2s16.val[0];
34 q0x2s16 = vld2q_s16(in);
35 q9s16 = q0x2s16.val[0];
37 q0x2s16 = vld2q_s16(in);
38 q10s16 = q0x2s16.val[0];
40 q0x2s16 = vld2q_s16(in);
41 q11s16 = q0x2s16.val[0];
43 q0x2s16 = vld2q_s16(in);
44 q12s16 = q0x2s16.val[0];
46 q0x2s16 = vld2q_s16(in);
47 q13s16 = q0x2s16.val[0];
49 q0x2s16 = vld2q_s16(in);
50 q14s16 = q0x2s16.val[0];
52 q0x2s16 = vld2q_s16(in);
53 q15s16 = q0x2s16.val[0];
// Transpose the 8x8 tile so the idct below operates on columns.
// (The trailing `&q15s16);` argument line is missing in this copy.)
55 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Split each 128-bit vector into 64-bit halves for vmull/vmlal/vmlsl.
58 d16s16 = vget_low_s16(q8s16);
59 d17s16 = vget_high_s16(q8s16);
60 d18s16 = vget_low_s16(q9s16);
61 d19s16 = vget_high_s16(q9s16);
62 d20s16 = vget_low_s16(q10s16);
63 d21s16 = vget_high_s16(q10s16);
64 d22s16 = vget_low_s16(q11s16);
65 d23s16 = vget_high_s16(q11s16);
66 d24s16 = vget_low_s16(q12s16);
67 d25s16 = vget_high_s16(q12s16);
68 d26s16 = vget_low_s16(q13s16);
69 d27s16 = vget_high_s16(q13s16);
70 d28s16 = vget_low_s16(q14s16);
71 d29s16 = vget_high_s16(q14s16);
72 d30s16 = vget_low_s16(q15s16);
73 d31s16 = vget_high_s16(q15s16);
// Rotation by cospi_28/cospi_4: q4/q7 = combinations of rows 9 and 15,
// with 14-bit rounding narrows (vqrshrn_n_s32(., 14)) back to int16.
76 d0s16 = vdup_n_s16((int16_t)cospi_28_64);
77 d1s16 = vdup_n_s16((int16_t)cospi_4_64);
79 q2s32 = vmull_s16(d18s16, d0s16);
80 q3s32 = vmull_s16(d19s16, d0s16);
81 q5s32 = vmull_s16(d18s16, d1s16);
82 q6s32 = vmull_s16(d19s16, d1s16);
84 q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
85 q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
86 q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
87 q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
// Rotation by cospi_12/cospi_20 on rows 13 and 11 -> q5/q6.
89 d2s16 = vdup_n_s16((int16_t)cospi_12_64);
90 d3s16 = vdup_n_s16((int16_t)cospi_20_64);
92 d8s16 = vqrshrn_n_s32(q2s32, 14);
93 d9s16 = vqrshrn_n_s32(q3s32, 14);
94 d14s16 = vqrshrn_n_s32(q5s32, 14);
95 d15s16 = vqrshrn_n_s32(q6s32, 14);
96 q4s16 = vcombine_s16(d8s16, d9s16);
97 q7s16 = vcombine_s16(d14s16, d15s16);
99 q2s32 = vmull_s16(d26s16, d2s16);
100 q3s32 = vmull_s16(d27s16, d2s16);
101 q9s32 = vmull_s16(d26s16, d3s16);
102 q15s32 = vmull_s16(d27s16, d3s16);
104 q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
105 q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
106 q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
107 q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
109 d10s16 = vqrshrn_n_s32(q2s32, 14);
110 d11s16 = vqrshrn_n_s32(q3s32, 14);
111 d12s16 = vqrshrn_n_s32(q9s32, 14);
112 d13s16 = vqrshrn_n_s32(q15s32, 14);
113 q5s16 = vcombine_s16(d10s16, d11s16);
114 q6s16 = vcombine_s16(d12s16, d13s16);
// Even half: cospi_16 butterfly on rows 8 and 12 -> q8/q9, then
// cospi_24/cospi_8 rotation on rows 10 and 14 -> q10/q11.
117 d30s16 = vdup_n_s16((int16_t)cospi_16_64);
119 q2s32 = vmull_s16(d16s16, d30s16);
120 q11s32 = vmull_s16(d17s16, d30s16);
121 q0s32 = vmull_s16(d24s16, d30s16);
122 q1s32 = vmull_s16(d25s16, d30s16);
// d30/d31 are reused as new constants here; their old contents are dead.
124 d30s16 = vdup_n_s16((int16_t)cospi_24_64);
125 d31s16 = vdup_n_s16((int16_t)cospi_8_64);
127 q3s32 = vaddq_s32(q2s32, q0s32);
128 q12s32 = vaddq_s32(q11s32, q1s32);
129 q13s32 = vsubq_s32(q2s32, q0s32);
130 q1s32 = vsubq_s32(q11s32, q1s32);
132 d16s16 = vqrshrn_n_s32(q3s32, 14);
133 d17s16 = vqrshrn_n_s32(q12s32, 14);
134 d18s16 = vqrshrn_n_s32(q13s32, 14);
135 d19s16 = vqrshrn_n_s32(q1s32, 14);
136 q8s16 = vcombine_s16(d16s16, d17s16);
137 q9s16 = vcombine_s16(d18s16, d19s16);
139 q0s32 = vmull_s16(d20s16, d31s16);
140 q1s32 = vmull_s16(d21s16, d31s16);
141 q12s32 = vmull_s16(d20s16, d30s16);
142 q13s32 = vmull_s16(d21s16, d30s16);
144 q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
145 q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
146 q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
147 q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
149 d22s16 = vqrshrn_n_s32(q0s32, 14);
150 d23s16 = vqrshrn_n_s32(q1s32, 14);
151 d20s16 = vqrshrn_n_s32(q12s32, 14);
152 d21s16 = vqrshrn_n_s32(q13s32, 14);
153 q10s16 = vcombine_s16(d20s16, d21s16);
154 q11s16 = vcombine_s16(d22s16, d23s16);
// Combine the odd-half rotation results (q4..q7) into sums/differences.
156 q13s16 = vsubq_s16(q4s16, q5s16);
157 q4s16 = vaddq_s16(q4s16, q5s16);
158 q14s16 = vsubq_s16(q7s16, q6s16);
159 q15s16 = vaddq_s16(q6s16, q7s16);
160 d26s16 = vget_low_s16(q13s16);
161 d27s16 = vget_high_s16(q13s16);
162 d28s16 = vget_low_s16(q14s16);
163 d29s16 = vget_high_s16(q14s16);
// Combine the even-half results (q8..q11) into sums/differences.
166 q0s16 = vaddq_s16(q8s16, q11s16);
167 q1s16 = vaddq_s16(q9s16, q10s16);
168 q2s16 = vsubq_s16(q9s16, q10s16);
169 q3s16 = vsubq_s16(q8s16, q11s16);
// Scale the odd-half differences by cospi_16 (sqrt(2)/2 style rotation).
171 d16s16 = vdup_n_s16((int16_t)cospi_16_64);
173 q11s32 = vmull_s16(d26s16, d16s16);
174 q12s32 = vmull_s16(d27s16, d16s16);
175 q9s32 = vmull_s16(d28s16, d16s16);
176 q10s32 = vmull_s16(d29s16, d16s16);
178 q6s32 = vsubq_s32(q9s32, q11s32);
179 q13s32 = vsubq_s32(q10s32, q12s32);
180 q9s32 = vaddq_s32(q9s32, q11s32);
181 q10s32 = vaddq_s32(q10s32, q12s32);
183 d10s16 = vqrshrn_n_s32(q6s32, 14);
184 d11s16 = vqrshrn_n_s32(q13s32, 14);
185 d12s16 = vqrshrn_n_s32(q9s32, 14);
186 d13s16 = vqrshrn_n_s32(q10s32, 14);
187 q5s16 = vcombine_s16(d10s16, d11s16);
188 q6s16 = vcombine_s16(d12s16, d13s16);
// Final butterflies: produce the eight output vectors q8..q15.
191 q8s16 = vaddq_s16(q0s16, q15s16);
192 q9s16 = vaddq_s16(q1s16, q6s16);
193 q10s16 = vaddq_s16(q2s16, q5s16);
194 q11s16 = vaddq_s16(q3s16, q4s16);
195 q12s16 = vsubq_s16(q3s16, q4s16);
196 q13s16 = vsubq_s16(q2s16, q5s16);
197 q14s16 = vsubq_s16(q1s16, q6s16);
198 q15s16 = vsubq_s16(q0s16, q15s16);
// Reinterpret as u64 lanes purely for the 8-byte vst1_u64 stores below.
200 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
201 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
202 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
203 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
204 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
205 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
206 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
207 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
208 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
209 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
210 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
211 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
212 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
213 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
214 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
215 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
// Store all sixteen halves; `out` steps by output_stride (int16 units,
// hence the >>1 since the stride was presumably given in bytes — the
// parameter declaration itself is missing from this copy).
218 output_stride >>= 1; // output_stride / 2, out is int16_t
219 vst1_u64((uint64_t *)out, d16u64);
220 out += output_stride;
221 vst1_u64((uint64_t *)out, d17u64);
222 out += output_stride;
223 vst1_u64((uint64_t *)out, d18u64);
224 out += output_stride;
225 vst1_u64((uint64_t *)out, d19u64);
226 out += output_stride;
227 vst1_u64((uint64_t *)out, d20u64);
228 out += output_stride;
229 vst1_u64((uint64_t *)out, d21u64);
230 out += output_stride;
231 vst1_u64((uint64_t *)out, d22u64);
232 out += output_stride;
233 vst1_u64((uint64_t *)out, d23u64);
234 out += output_stride;
235 vst1_u64((uint64_t *)out, d24u64);
236 out += output_stride;
237 vst1_u64((uint64_t *)out, d25u64);
238 out += output_stride;
239 vst1_u64((uint64_t *)out, d26u64);
240 out += output_stride;
241 vst1_u64((uint64_t *)out, d27u64);
242 out += output_stride;
243 vst1_u64((uint64_t *)out, d28u64);
244 out += output_stride;
245 vst1_u64((uint64_t *)out, d29u64);
246 out += output_stride;
247 vst1_u64((uint64_t *)out, d30u64);
248 out += output_stride;
249 vst1_u64((uint64_t *)out, d31u64);
// Pass 2 of the NEON 16x16 inverse DCT (full path). Loads the odd-row
// coefficients from `src`, runs the odd-half butterfly stages, then
// combines with the pass-1 results read from `pass1Output`. When
// `skip_adding` is non-zero the sums are rounded (>>6 via vrshrq_n_s16),
// added to the `dest` pixels with saturation and stored to `d`; otherwise
// the raw 16-bit sums are written back to `out`.
// NOTE(review): despite the name, a NON-zero `skip_adding` takes the
// add-to-dest path in this code — verify against the caller.
// NOTE(review): extraction artifacts as in pass 1 — embedded original
// line numbers and missing lines (the `uint8_t *d = dest;` declaration,
// `src += ...` / `pass1Output += 8;` / `d += dest_stride;` /
// `out += 12;` advances, and the closing brace). Restore from upstream.
252 void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
253 int16_t *pass1Output, int16_t skip_adding,
254 uint8_t *dest, int dest_stride) {
// Scratch registers for the butterfly network and the pixel add path.
256 uint8x8_t d12u8, d13u8;
257 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
258 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
259 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
260 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
261 uint64x1_t d24u64, d25u64, d26u64, d27u64;
262 int64x1_t d12s64, d13s64;
263 uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
264 uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
265 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
266 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
267 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
268 int32x4_t q10s32, q11s32, q12s32, q13s32;
// Load 8 rows; vld2q_s16 de-interleaves and val[0] keeps even lanes.
// (`src += 8 * 2;` advances between loads are missing in this copy.)
271 q0x2s16 = vld2q_s16(src);
272 q8s16 = q0x2s16.val[0];
274 q0x2s16 = vld2q_s16(src);
275 q9s16 = q0x2s16.val[0];
277 q0x2s16 = vld2q_s16(src);
278 q10s16 = q0x2s16.val[0];
280 q0x2s16 = vld2q_s16(src);
281 q11s16 = q0x2s16.val[0];
283 q0x2s16 = vld2q_s16(src);
284 q12s16 = q0x2s16.val[0];
286 q0x2s16 = vld2q_s16(src);
287 q13s16 = q0x2s16.val[0];
289 q0x2s16 = vld2q_s16(src);
290 q14s16 = q0x2s16.val[0];
292 q0x2s16 = vld2q_s16(src);
293 q15s16 = q0x2s16.val[0];
// Transpose the 8x8 tile (trailing `&q15s16);` line missing here).
295 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Split into 64-bit halves for the widening multiplies below.
298 d16s16 = vget_low_s16(q8s16);
299 d17s16 = vget_high_s16(q8s16);
300 d18s16 = vget_low_s16(q9s16);
301 d19s16 = vget_high_s16(q9s16);
302 d20s16 = vget_low_s16(q10s16);
303 d21s16 = vget_high_s16(q10s16);
304 d22s16 = vget_low_s16(q11s16);
305 d23s16 = vget_high_s16(q11s16);
306 d24s16 = vget_low_s16(q12s16);
307 d25s16 = vget_high_s16(q12s16);
308 d26s16 = vget_low_s16(q13s16);
309 d27s16 = vget_high_s16(q13s16);
310 d28s16 = vget_low_s16(q14s16);
311 d29s16 = vget_high_s16(q14s16);
312 d30s16 = vget_low_s16(q15s16);
313 d31s16 = vget_high_s16(q15s16);
// Rotation by cospi_30/cospi_2 on rows 8 and 15 -> q0/q7.
316 d12s16 = vdup_n_s16((int16_t)cospi_30_64);
317 d13s16 = vdup_n_s16((int16_t)cospi_2_64);
319 q2s32 = vmull_s16(d16s16, d12s16);
320 q3s32 = vmull_s16(d17s16, d12s16);
321 q1s32 = vmull_s16(d16s16, d13s16);
322 q4s32 = vmull_s16(d17s16, d13s16);
324 q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
325 q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
326 q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
327 q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
329 d0s16 = vqrshrn_n_s32(q2s32, 14);
330 d1s16 = vqrshrn_n_s32(q3s32, 14);
331 d14s16 = vqrshrn_n_s32(q1s32, 14);
332 d15s16 = vqrshrn_n_s32(q4s32, 14);
333 q0s16 = vcombine_s16(d0s16, d1s16);
334 q7s16 = vcombine_s16(d14s16, d15s16);
// Rotation by cospi_14/cospi_18 on rows 12 and 11 -> q1/q6.
336 d30s16 = vdup_n_s16((int16_t)cospi_14_64);
337 d31s16 = vdup_n_s16((int16_t)cospi_18_64);
339 q2s32 = vmull_s16(d24s16, d30s16);
340 q3s32 = vmull_s16(d25s16, d30s16);
341 q4s32 = vmull_s16(d24s16, d31s16);
342 q5s32 = vmull_s16(d25s16, d31s16);
344 q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
345 q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
346 q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
347 q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
349 d2s16 = vqrshrn_n_s32(q2s32, 14);
350 d3s16 = vqrshrn_n_s32(q3s32, 14);
351 d12s16 = vqrshrn_n_s32(q4s32, 14);
352 d13s16 = vqrshrn_n_s32(q5s32, 14);
353 q1s16 = vcombine_s16(d2s16, d3s16);
354 q6s16 = vcombine_s16(d12s16, d13s16);
// Rotation by cospi_22/cospi_10 on rows 10 and 13 -> q2/q5.
356 d30s16 = vdup_n_s16((int16_t)cospi_22_64);
357 d31s16 = vdup_n_s16((int16_t)cospi_10_64);
359 q11s32 = vmull_s16(d20s16, d30s16);
360 q12s32 = vmull_s16(d21s16, d30s16);
361 q4s32 = vmull_s16(d20s16, d31s16);
362 q5s32 = vmull_s16(d21s16, d31s16);
364 q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
365 q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
366 q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
367 q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
369 d4s16 = vqrshrn_n_s32(q11s32, 14);
370 d5s16 = vqrshrn_n_s32(q12s32, 14);
371 d11s16 = vqrshrn_n_s32(q5s32, 14);
372 d10s16 = vqrshrn_n_s32(q4s32, 14);
373 q2s16 = vcombine_s16(d4s16, d5s16);
374 q5s16 = vcombine_s16(d10s16, d11s16);
// Rotation by cospi_6/cospi_26 on rows 14 and 9 -> q3/q4.
376 d30s16 = vdup_n_s16((int16_t)cospi_6_64);
377 d31s16 = vdup_n_s16((int16_t)cospi_26_64);
379 q10s32 = vmull_s16(d28s16, d30s16);
380 q11s32 = vmull_s16(d29s16, d30s16);
381 q12s32 = vmull_s16(d28s16, d31s16);
382 q13s32 = vmull_s16(d29s16, d31s16);
384 q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
385 q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
386 q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
387 q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
389 d6s16 = vqrshrn_n_s32(q10s32, 14);
390 d7s16 = vqrshrn_n_s32(q11s32, 14);
391 d8s16 = vqrshrn_n_s32(q12s32, 14);
392 d9s16 = vqrshrn_n_s32(q13s32, 14);
393 q3s16 = vcombine_s16(d6s16, d7s16);
394 q4s16 = vcombine_s16(d8s16, d9s16);
// Combine the four rotated pairs into sums and differences.
397 q9s16 = vsubq_s16(q0s16, q1s16);
398 q0s16 = vaddq_s16(q0s16, q1s16);
399 q10s16 = vsubq_s16(q3s16, q2s16);
400 q11s16 = vaddq_s16(q2s16, q3s16);
401 q12s16 = vaddq_s16(q4s16, q5s16);
402 q13s16 = vsubq_s16(q4s16, q5s16);
403 q14s16 = vsubq_s16(q7s16, q6s16);
404 q7s16 = vaddq_s16(q6s16, q7s16);
// Re-split the difference terms into 64-bit halves.
407 d18s16 = vget_low_s16(q9s16);
408 d19s16 = vget_high_s16(q9s16);
409 d20s16 = vget_low_s16(q10s16);
410 d21s16 = vget_high_s16(q10s16);
411 d26s16 = vget_low_s16(q13s16);
412 d27s16 = vget_high_s16(q13s16);
413 d28s16 = vget_low_s16(q14s16);
414 d29s16 = vget_high_s16(q14s16);
// Rotation by cospi_8/cospi_24 on the q9/q14 difference terms.
416 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
417 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
419 q2s32 = vmull_s16(d18s16, d31s16);
420 q3s32 = vmull_s16(d19s16, d31s16);
421 q4s32 = vmull_s16(d28s16, d31s16);
422 q5s32 = vmull_s16(d29s16, d31s16);
424 q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
425 q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
426 q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
427 q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
429 d12s16 = vqrshrn_n_s32(q2s32, 14);
430 d13s16 = vqrshrn_n_s32(q3s32, 14);
431 d2s16 = vqrshrn_n_s32(q4s32, 14);
432 d3s16 = vqrshrn_n_s32(q5s32, 14);
433 q1s16 = vcombine_s16(d2s16, d3s16);
434 q6s16 = vcombine_s16(d12s16, d13s16);
// Rotation with negated cospi_8 on the q10/q13 difference terms.
439 d30s16 = vdup_n_s16(-cospi_8_64);
440 q11s32 = vmull_s16(d26s16, d30s16);
441 q12s32 = vmull_s16(d27s16, d30s16);
442 q8s32 = vmull_s16(d20s16, d30s16);
443 q9s32 = vmull_s16(d21s16, d30s16);
445 q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
446 q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
447 q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
448 q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
450 d4s16 = vqrshrn_n_s32(q11s32, 14);
451 d5s16 = vqrshrn_n_s32(q12s32, 14);
452 d10s16 = vqrshrn_n_s32(q8s32, 14);
453 d11s16 = vqrshrn_n_s32(q9s32, 14);
454 q2s16 = vcombine_s16(d4s16, d5s16);
455 q5s16 = vcombine_s16(d10s16, d11s16);
// Next butterfly level: sums/differences of the rotated terms.
458 q8s16 = vaddq_s16(q0s16, q3s16);
459 q9s16 = vaddq_s16(q1s16, q2s16);
460 q10s16 = vsubq_s16(q1s16, q2s16);
461 q11s16 = vsubq_s16(q0s16, q3s16);
462 q12s16 = vsubq_s16(q7s16, q4s16);
463 q13s16 = vsubq_s16(q6s16, q5s16);
464 q14s16 = vaddq_s16(q6s16, q5s16);
465 q15s16 = vaddq_s16(q7s16, q4s16);
// Split the difference terms for the final cospi_16 scaling.
468 d20s16 = vget_low_s16(q10s16);
469 d21s16 = vget_high_s16(q10s16);
470 d22s16 = vget_low_s16(q11s16);
471 d23s16 = vget_high_s16(q11s16);
472 d24s16 = vget_low_s16(q12s16);
473 d25s16 = vget_high_s16(q12s16);
474 d26s16 = vget_low_s16(q13s16);
475 d27s16 = vget_high_s16(q13s16);
// Scale the differences by cospi_16 (shared factor for both halves).
477 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
479 q3s32 = vmull_s16(d26s16, d14s16);
480 q4s32 = vmull_s16(d27s16, d14s16);
481 q0s32 = vmull_s16(d20s16, d14s16);
482 q1s32 = vmull_s16(d21s16, d14s16);
484 q5s32 = vsubq_s32(q3s32, q0s32);
485 q6s32 = vsubq_s32(q4s32, q1s32);
486 q10s32 = vaddq_s32(q3s32, q0s32);
487 q4s32 = vaddq_s32(q4s32, q1s32);
489 d4s16 = vqrshrn_n_s32(q5s32, 14);
490 d5s16 = vqrshrn_n_s32(q6s32, 14);
491 d10s16 = vqrshrn_n_s32(q10s32, 14);
492 d11s16 = vqrshrn_n_s32(q4s32, 14);
493 q2s16 = vcombine_s16(d4s16, d5s16);
494 q5s16 = vcombine_s16(d10s16, d11s16);
496 q0s32 = vmull_s16(d22s16, d14s16);
497 q1s32 = vmull_s16(d23s16, d14s16);
498 q13s32 = vmull_s16(d24s16, d14s16);
499 q6s32 = vmull_s16(d25s16, d14s16);
501 q10s32 = vsubq_s32(q13s32, q0s32);
502 q4s32 = vsubq_s32(q6s32, q1s32);
503 q13s32 = vaddq_s32(q13s32, q0s32);
504 q6s32 = vaddq_s32(q6s32, q1s32);
506 d6s16 = vqrshrn_n_s32(q10s32, 14);
507 d7s16 = vqrshrn_n_s32(q4s32, 14);
508 d8s16 = vqrshrn_n_s32(q13s32, 14);
509 d9s16 = vqrshrn_n_s32(q6s32, 14);
510 q3s16 = vcombine_s16(d6s16, d7s16);
511 q4s16 = vcombine_s16(d8s16, d9s16);
// Reconstruction: non-zero skip_adding -> round, add to dest, store to d.
// NOTE(review): `d` is used below but its declaration/initialization
// (presumably `uint8_t *d = dest;`) is missing from this copy, as are the
// `pass1Output += 8;` / `dest += dest_stride;` / `d += dest_stride;`
// advances between each load/store pair.
514 if (skip_adding != 0) {
516 // load the data in pass1
// Rows 0-1: pass1 + q15/q14, round >>6, add to pixels, saturate, store.
517 q0s16 = vld1q_s16(pass1Output);
519 q1s16 = vld1q_s16(pass1Output);
521 d12s64 = vld1_s64((int64_t *)dest);
523 d13s64 = vld1_s64((int64_t *)dest);
526 q12s16 = vaddq_s16(q0s16, q15s16);
527 q13s16 = vaddq_s16(q1s16, q14s16);
528 q12s16 = vrshrq_n_s16(q12s16, 6);
529 q13s16 = vrshrq_n_s16(q13s16, 6);
531 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
533 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
534 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
535 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
536 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
538 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
540 q14s16 = vsubq_s16(q1s16, q14s16);
541 q15s16 = vsubq_s16(q0s16, q15s16);
// Rows 2-3: same pattern with q5/q4.
543 q10s16 = vld1q_s16(pass1Output);
545 q11s16 = vld1q_s16(pass1Output);
547 d12s64 = vld1_s64((int64_t *)dest);
549 d13s64 = vld1_s64((int64_t *)dest);
551 q12s16 = vaddq_s16(q10s16, q5s16);
552 q13s16 = vaddq_s16(q11s16, q4s16);
553 q12s16 = vrshrq_n_s16(q12s16, 6);
554 q13s16 = vrshrq_n_s16(q13s16, 6);
556 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
558 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
559 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
560 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
561 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
563 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
565 q4s16 = vsubq_s16(q11s16, q4s16);
566 q5s16 = vsubq_s16(q10s16, q5s16);
// Rows 4-5: same pattern with q3/q2.
568 q0s16 = vld1q_s16(pass1Output);
570 q1s16 = vld1q_s16(pass1Output);
572 d12s64 = vld1_s64((int64_t *)dest);
574 d13s64 = vld1_s64((int64_t *)dest);
576 q12s16 = vaddq_s16(q0s16, q3s16);
577 q13s16 = vaddq_s16(q1s16, q2s16);
578 q12s16 = vrshrq_n_s16(q12s16, 6);
579 q13s16 = vrshrq_n_s16(q13s16, 6);
581 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
583 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
584 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
585 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
586 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
588 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
590 q2s16 = vsubq_s16(q1s16, q2s16);
591 q3s16 = vsubq_s16(q0s16, q3s16);
// Rows 6-7: same pattern with q9/q8.
593 q10s16 = vld1q_s16(pass1Output);
595 q11s16 = vld1q_s16(pass1Output);
596 d12s64 = vld1_s64((int64_t *)dest);
598 d13s64 = vld1_s64((int64_t *)dest);
600 q12s16 = vaddq_s16(q10s16, q9s16);
601 q13s16 = vaddq_s16(q11s16, q8s16);
602 q12s16 = vrshrq_n_s16(q12s16, 6);
603 q13s16 = vrshrq_n_s16(q13s16, 6);
605 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
607 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
608 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
609 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
610 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
612 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
614 q8s16 = vsubq_s16(q11s16, q8s16);
615 q9s16 = vsubq_s16(q10s16, q9s16);
// Rows 8..15: differences computed above, rounded, added to pixels,
// saturated and stored one row at a time.
617 // store the data out 8,9,10,11,12,13,14,15
618 d12s64 = vld1_s64((int64_t *)dest);
620 q8s16 = vrshrq_n_s16(q8s16, 6);
621 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
622 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
623 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
626 d12s64 = vld1_s64((int64_t *)dest);
628 q9s16 = vrshrq_n_s16(q9s16, 6);
629 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
630 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
631 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
634 d12s64 = vld1_s64((int64_t *)dest);
636 q2s16 = vrshrq_n_s16(q2s16, 6);
637 q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
638 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
639 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
642 d12s64 = vld1_s64((int64_t *)dest);
644 q3s16 = vrshrq_n_s16(q3s16, 6);
645 q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
646 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
647 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
650 d12s64 = vld1_s64((int64_t *)dest);
652 q4s16 = vrshrq_n_s16(q4s16, 6);
653 q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
654 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
655 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
658 d12s64 = vld1_s64((int64_t *)dest);
660 q5s16 = vrshrq_n_s16(q5s16, 6);
661 q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
662 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
663 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
666 d12s64 = vld1_s64((int64_t *)dest);
668 q14s16 = vrshrq_n_s16(q14s16, 6);
670 vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
671 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
672 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
675 d12s64 = vld1_s64((int64_t *)dest);
676 q15s16 = vrshrq_n_s16(q15s16, 6);
678 vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
679 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
680 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
// Store-only path: write the raw 16-bit pass1+pass2 sums back to `out`.
// (`pass1Output += 8;` and `out += ...` advances are missing here.)
681 } else { // skip_adding_dest
682 q0s16 = vld1q_s16(pass1Output);
684 q1s16 = vld1q_s16(pass1Output);
686 q12s16 = vaddq_s16(q0s16, q15s16);
687 q13s16 = vaddq_s16(q1s16, q14s16);
688 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
689 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
690 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
691 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
692 vst1_u64((uint64_t *)out, d24u64);
694 vst1_u64((uint64_t *)out, d25u64);
696 vst1_u64((uint64_t *)out, d26u64);
698 vst1_u64((uint64_t *)out, d27u64);
700 q14s16 = vsubq_s16(q1s16, q14s16);
701 q15s16 = vsubq_s16(q0s16, q15s16);
703 q10s16 = vld1q_s16(pass1Output);
705 q11s16 = vld1q_s16(pass1Output);
707 q12s16 = vaddq_s16(q10s16, q5s16);
708 q13s16 = vaddq_s16(q11s16, q4s16);
709 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
710 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
711 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
712 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
713 vst1_u64((uint64_t *)out, d24u64);
715 vst1_u64((uint64_t *)out, d25u64);
717 vst1_u64((uint64_t *)out, d26u64);
719 vst1_u64((uint64_t *)out, d27u64);
721 q4s16 = vsubq_s16(q11s16, q4s16);
722 q5s16 = vsubq_s16(q10s16, q5s16);
724 q0s16 = vld1q_s16(pass1Output);
726 q1s16 = vld1q_s16(pass1Output);
728 q12s16 = vaddq_s16(q0s16, q3s16);
729 q13s16 = vaddq_s16(q1s16, q2s16);
730 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
731 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
732 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
733 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
734 vst1_u64((uint64_t *)out, d24u64);
736 vst1_u64((uint64_t *)out, d25u64);
738 vst1_u64((uint64_t *)out, d26u64);
740 vst1_u64((uint64_t *)out, d27u64);
742 q2s16 = vsubq_s16(q1s16, q2s16);
743 q3s16 = vsubq_s16(q0s16, q3s16);
745 q10s16 = vld1q_s16(pass1Output);
747 q11s16 = vld1q_s16(pass1Output);
749 q12s16 = vaddq_s16(q10s16, q9s16);
750 q13s16 = vaddq_s16(q11s16, q8s16);
751 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
752 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
753 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
754 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
755 vst1_u64((uint64_t *)out, d24u64);
757 vst1_u64((uint64_t *)out, d25u64);
759 vst1_u64((uint64_t *)out, d26u64);
761 vst1_u64((uint64_t *)out, d27u64);
763 q8s16 = vsubq_s16(q11s16, q8s16);
764 q9s16 = vsubq_s16(q10s16, q9s16);
// Rows 8..15: store the difference vectors directly.
766 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
768 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
770 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
772 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
774 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
776 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
778 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
780 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
782 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
784 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
786 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
788 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
790 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
792 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
794 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
796 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
// Pass 1 of the 16x16 IDCT for the sparse (10 non-zero coefficients)
// shortcut. Only the top-left coefficients contribute, so several stages
// collapse to single vqrdmulhq_s16 multiplies by doubled cospi constants
// instead of full butterfly networks.
// NOTE(review): this copy carries the same extraction artifacts — embedded
// original line numbers and missing lines (the `int output_stride)`
// signature continuation, the `int16x8x2_t q0x2s16;` and `int16x4_t
// d4s16;` declarations, `in += 8 * 2;` advances, the `&q15s16);` tail of
// the transpose call, and the closing brace). Restore from upstream.
800 void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
803 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
804 uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
805 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
806 int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
807 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
808 int32x4_t q6s32, q9s32;
809 int32x4_t q10s32, q11s32, q12s32, q15s32;
// Load 8 rows; vld2q_s16 de-interleaves, val[0] keeps the even lanes.
812 q0x2s16 = vld2q_s16(in);
813 q8s16 = q0x2s16.val[0];
815 q0x2s16 = vld2q_s16(in);
816 q9s16 = q0x2s16.val[0];
818 q0x2s16 = vld2q_s16(in);
819 q10s16 = q0x2s16.val[0];
821 q0x2s16 = vld2q_s16(in);
822 q11s16 = q0x2s16.val[0];
824 q0x2s16 = vld2q_s16(in);
825 q12s16 = q0x2s16.val[0];
827 q0x2s16 = vld2q_s16(in);
828 q13s16 = q0x2s16.val[0];
830 q0x2s16 = vld2q_s16(in);
831 q14s16 = q0x2s16.val[0];
833 q0x2s16 = vld2q_s16(in);
834 q15s16 = q0x2s16.val[0];
// Transpose the 8x8 tile (trailing `&q15s16);` line missing here).
836 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
// Shortcut rotations: vqrdmulhq by 2*cospi gives the same result as the
// full vmull/vqrshrn(.,14) sequence when only one input term is non-zero.
840 q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2);
841 q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2);
843 q4s16 = vqrdmulhq_s16(q9s16, q0s16);
844 q7s16 = vqrdmulhq_s16(q9s16, q1s16);
// DC path: scale row 0 by cospi_16.
847 q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2);
848 d4s16 = vdup_n_s16((int16_t)cospi_16_64);
850 q8s16 = vqrdmulhq_s16(q8s16, q1s16);
// Full cospi_16 butterfly on the q4/q7 halves -> q5/q6.
852 d8s16 = vget_low_s16(q4s16);
853 d9s16 = vget_high_s16(q4s16);
854 d14s16 = vget_low_s16(q7s16);
855 d15s16 = vget_high_s16(q7s16);
856 q9s32 = vmull_s16(d14s16, d4s16);
857 q10s32 = vmull_s16(d15s16, d4s16);
858 q12s32 = vmull_s16(d9s16, d4s16);
859 q11s32 = vmull_s16(d8s16, d4s16);
861 q15s32 = vsubq_s32(q10s32, q12s32);
862 q6s32 = vsubq_s32(q9s32, q11s32);
863 q9s32 = vaddq_s32(q9s32, q11s32);
864 q10s32 = vaddq_s32(q10s32, q12s32);
866 d11s16 = vqrshrn_n_s32(q15s32, 14);
867 d10s16 = vqrshrn_n_s32(q6s32, 14);
868 d12s16 = vqrshrn_n_s32(q9s32, 14);
869 d13s16 = vqrshrn_n_s32(q10s32, 14);
870 q5s16 = vcombine_s16(d10s16, d11s16);
871 q6s16 = vcombine_s16(d12s16, d13s16);
// Final stage: everything is DC (q8) plus/minus the odd-half terms.
874 q2s16 = vaddq_s16(q8s16, q7s16);
875 q9s16 = vaddq_s16(q8s16, q6s16);
876 q10s16 = vaddq_s16(q8s16, q5s16);
877 q11s16 = vaddq_s16(q8s16, q4s16);
878 q12s16 = vsubq_s16(q8s16, q4s16);
879 q13s16 = vsubq_s16(q8s16, q5s16);
880 q14s16 = vsubq_s16(q8s16, q6s16);
881 q15s16 = vsubq_s16(q8s16, q7s16);
// Reinterpret as u64 lanes for the 8-byte stores.
883 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
884 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
885 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
886 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
887 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
888 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
889 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
890 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
891 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
892 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
893 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
894 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
895 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
896 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
897 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
898 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
// Store all sixteen halves, stepping `out` by output_stride int16s.
901 output_stride >>= 1; // output_stride / 2, out is int16_t
902 vst1_u64((uint64_t *)out, d4u64);
903 out += output_stride;
904 vst1_u64((uint64_t *)out, d5u64);
905 out += output_stride;
906 vst1_u64((uint64_t *)out, d18u64);
907 out += output_stride;
908 vst1_u64((uint64_t *)out, d19u64);
909 out += output_stride;
910 vst1_u64((uint64_t *)out, d20u64);
911 out += output_stride;
912 vst1_u64((uint64_t *)out, d21u64);
913 out += output_stride;
914 vst1_u64((uint64_t *)out, d22u64);
915 out += output_stride;
916 vst1_u64((uint64_t *)out, d23u64);
917 out += output_stride;
918 vst1_u64((uint64_t *)out, d24u64);
919 out += output_stride;
920 vst1_u64((uint64_t *)out, d25u64);
921 out += output_stride;
922 vst1_u64((uint64_t *)out, d26u64);
923 out += output_stride;
924 vst1_u64((uint64_t *)out, d27u64);
925 out += output_stride;
926 vst1_u64((uint64_t *)out, d28u64);
927 out += output_stride;
928 vst1_u64((uint64_t *)out, d29u64);
929 out += output_stride;
930 vst1_u64((uint64_t *)out, d30u64);
931 out += output_stride;
932 vst1_u64((uint64_t *)out, d31u64);
935 void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
936 int16_t *pass1Output, int16_t skip_adding,
937 uint8_t *dest, int dest_stride) {
938 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
939 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
940 int16x4_t d20s16, d21s16, d22s16, d23s16;
941 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
942 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
943 uint64x1_t d16u64, d17u64, d18u64, d19u64;
944 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
945 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
946 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
947 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
948 int32x4_t q10s32, q11s32, q12s32, q13s32;
954 q0x2s16 = vld2q_s16(src);
955 q8s16 = q0x2s16.val[0];
957 q0x2s16 = vld2q_s16(src);
958 q9s16 = q0x2s16.val[0];
960 q0x2s16 = vld2q_s16(src);
961 q10s16 = q0x2s16.val[0];
963 q0x2s16 = vld2q_s16(src);
964 q11s16 = q0x2s16.val[0];
966 q0x2s16 = vld2q_s16(src);
967 q12s16 = q0x2s16.val[0];
969 q0x2s16 = vld2q_s16(src);
970 q13s16 = q0x2s16.val[0];
972 q0x2s16 = vld2q_s16(src);
973 q14s16 = q0x2s16.val[0];
975 q0x2s16 = vld2q_s16(src);
976 q15s16 = q0x2s16.val[0];
978 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
982 q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2);
983 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
984 q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2);
985 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
987 q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2);
988 q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2);
989 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
990 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
993 d0s16 = vget_low_s16(q0s16);
994 d1s16 = vget_high_s16(q0s16);
995 d6s16 = vget_low_s16(q3s16);
996 d7s16 = vget_high_s16(q3s16);
997 d8s16 = vget_low_s16(q4s16);
998 d9s16 = vget_high_s16(q4s16);
999 d14s16 = vget_low_s16(q7s16);
1000 d15s16 = vget_high_s16(q7s16);
1002 d30s16 = vdup_n_s16((int16_t)cospi_8_64);
1003 d31s16 = vdup_n_s16((int16_t)cospi_24_64);
1005 q12s32 = vmull_s16(d14s16, d31s16);
1006 q5s32 = vmull_s16(d15s16, d31s16);
1007 q2s32 = vmull_s16(d0s16, d31s16);
1008 q11s32 = vmull_s16(d1s16, d31s16);
1010 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
1011 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
1012 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
1013 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
1015 d2s16 = vqrshrn_n_s32(q12s32, 14);
1016 d3s16 = vqrshrn_n_s32(q5s32, 14);
1017 d12s16 = vqrshrn_n_s32(q2s32, 14);
1018 d13s16 = vqrshrn_n_s32(q11s32, 14);
1019 q1s16 = vcombine_s16(d2s16, d3s16);
1020 q6s16 = vcombine_s16(d12s16, d13s16);
1022 d30s16 = vdup_n_s16(-cospi_8_64);
1023 q10s32 = vmull_s16(d8s16, d30s16);
1024 q13s32 = vmull_s16(d9s16, d30s16);
1025 q8s32 = vmull_s16(d6s16, d30s16);
1026 q9s32 = vmull_s16(d7s16, d30s16);
1028 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
1029 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
1030 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
1031 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
1033 d4s16 = vqrshrn_n_s32(q10s32, 14);
1034 d5s16 = vqrshrn_n_s32(q13s32, 14);
1035 d10s16 = vqrshrn_n_s32(q8s32, 14);
1036 d11s16 = vqrshrn_n_s32(q9s32, 14);
1037 q2s16 = vcombine_s16(d4s16, d5s16);
1038 q5s16 = vcombine_s16(d10s16, d11s16);
1041 q8s16 = vaddq_s16(q0s16, q3s16);
1042 q9s16 = vaddq_s16(q1s16, q2s16);
1043 q10s16 = vsubq_s16(q1s16, q2s16);
1044 q11s16 = vsubq_s16(q0s16, q3s16);
1045 q12s16 = vsubq_s16(q7s16, q4s16);
1046 q13s16 = vsubq_s16(q6s16, q5s16);
1047 q14s16 = vaddq_s16(q6s16, q5s16);
1048 q15s16 = vaddq_s16(q7s16, q4s16);
1051 d20s16 = vget_low_s16(q10s16);
1052 d21s16 = vget_high_s16(q10s16);
1053 d22s16 = vget_low_s16(q11s16);
1054 d23s16 = vget_high_s16(q11s16);
1055 d24s16 = vget_low_s16(q12s16);
1056 d25s16 = vget_high_s16(q12s16);
1057 d26s16 = vget_low_s16(q13s16);
1058 d27s16 = vget_high_s16(q13s16);
1060 d14s16 = vdup_n_s16((int16_t)cospi_16_64);
1061 q3s32 = vmull_s16(d26s16, d14s16);
1062 q4s32 = vmull_s16(d27s16, d14s16);
1063 q0s32 = vmull_s16(d20s16, d14s16);
1064 q1s32 = vmull_s16(d21s16, d14s16);
1066 q5s32 = vsubq_s32(q3s32, q0s32);
1067 q6s32 = vsubq_s32(q4s32, q1s32);
1068 q0s32 = vaddq_s32(q3s32, q0s32);
1069 q4s32 = vaddq_s32(q4s32, q1s32);
1071 d4s16 = vqrshrn_n_s32(q5s32, 14);
1072 d5s16 = vqrshrn_n_s32(q6s32, 14);
1073 d10s16 = vqrshrn_n_s32(q0s32, 14);
1074 d11s16 = vqrshrn_n_s32(q4s32, 14);
1075 q2s16 = vcombine_s16(d4s16, d5s16);
1076 q5s16 = vcombine_s16(d10s16, d11s16);
1078 q0s32 = vmull_s16(d22s16, d14s16);
1079 q1s32 = vmull_s16(d23s16, d14s16);
1080 q13s32 = vmull_s16(d24s16, d14s16);
1081 q6s32 = vmull_s16(d25s16, d14s16);
1083 q10s32 = vsubq_s32(q13s32, q0s32);
1084 q4s32 = vsubq_s32(q6s32, q1s32);
1085 q13s32 = vaddq_s32(q13s32, q0s32);
1086 q6s32 = vaddq_s32(q6s32, q1s32);
1088 d6s16 = vqrshrn_n_s32(q10s32, 14);
1089 d7s16 = vqrshrn_n_s32(q4s32, 14);
1090 d8s16 = vqrshrn_n_s32(q13s32, 14);
1091 d9s16 = vqrshrn_n_s32(q6s32, 14);
1092 q3s16 = vcombine_s16(d6s16, d7s16);
1093 q4s16 = vcombine_s16(d8s16, d9s16);
1096 q0s16 = vld1q_s16(pass1Output);
1098 q1s16 = vld1q_s16(pass1Output);
1100 q12s16 = vaddq_s16(q0s16, q15s16);
1101 q13s16 = vaddq_s16(q1s16, q14s16);
1102 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1103 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1104 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1105 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1106 vst1_u64((uint64_t *)out, d24u64);
1108 vst1_u64((uint64_t *)out, d25u64);
1110 vst1_u64((uint64_t *)out, d26u64);
1112 vst1_u64((uint64_t *)out, d27u64);
1114 q14s16 = vsubq_s16(q1s16, q14s16);
1115 q15s16 = vsubq_s16(q0s16, q15s16);
1117 q10s16 = vld1q_s16(pass1Output);
1119 q11s16 = vld1q_s16(pass1Output);
1121 q12s16 = vaddq_s16(q10s16, q5s16);
1122 q13s16 = vaddq_s16(q11s16, q4s16);
1123 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1124 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1125 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1126 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1127 vst1_u64((uint64_t *)out, d24u64);
1129 vst1_u64((uint64_t *)out, d25u64);
1131 vst1_u64((uint64_t *)out, d26u64);
1133 vst1_u64((uint64_t *)out, d27u64);
1135 q4s16 = vsubq_s16(q11s16, q4s16);
1136 q5s16 = vsubq_s16(q10s16, q5s16);
1138 q0s16 = vld1q_s16(pass1Output);
1140 q1s16 = vld1q_s16(pass1Output);
1142 q12s16 = vaddq_s16(q0s16, q3s16);
1143 q13s16 = vaddq_s16(q1s16, q2s16);
1144 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1145 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1146 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1147 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1148 vst1_u64((uint64_t *)out, d24u64);
1150 vst1_u64((uint64_t *)out, d25u64);
1152 vst1_u64((uint64_t *)out, d26u64);
1154 vst1_u64((uint64_t *)out, d27u64);
1156 q2s16 = vsubq_s16(q1s16, q2s16);
1157 q3s16 = vsubq_s16(q0s16, q3s16);
1159 q10s16 = vld1q_s16(pass1Output);
1161 q11s16 = vld1q_s16(pass1Output);
1162 q12s16 = vaddq_s16(q10s16, q9s16);
1163 q13s16 = vaddq_s16(q11s16, q8s16);
1164 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1165 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1166 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1167 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1168 vst1_u64((uint64_t *)out, d24u64);
1170 vst1_u64((uint64_t *)out, d25u64);
1172 vst1_u64((uint64_t *)out, d26u64);
1174 vst1_u64((uint64_t *)out, d27u64);
1176 q8s16 = vsubq_s16(q11s16, q8s16);
1177 q9s16 = vsubq_s16(q10s16, q9s16);
1179 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1180 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1181 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1182 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1183 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1184 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1185 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1186 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1187 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1188 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1189 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1190 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1191 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1192 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1193 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1194 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1196 vst1_u64((uint64_t *)out, d16u64);
1198 vst1_u64((uint64_t *)out, d17u64);
1200 vst1_u64((uint64_t *)out, d18u64);
1202 vst1_u64((uint64_t *)out, d19u64);
1204 vst1_u64((uint64_t *)out, d4u64);
1206 vst1_u64((uint64_t *)out, d5u64);
1208 vst1_u64((uint64_t *)out, d6u64);
1210 vst1_u64((uint64_t *)out, d7u64);
1212 vst1_u64((uint64_t *)out, d8u64);
1214 vst1_u64((uint64_t *)out, d9u64);
1216 vst1_u64((uint64_t *)out, d10u64);
1218 vst1_u64((uint64_t *)out, d11u64);
1220 vst1_u64((uint64_t *)out, d28u64);
1222 vst1_u64((uint64_t *)out, d29u64);
1224 vst1_u64((uint64_t *)out, d30u64);
1226 vst1_u64((uint64_t *)out, d31u64);