/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_config.h"
14 #include "vpx_dsp/txfm_common.h"
16 static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16,
17 int16x8_t *q10s16, int16x8_t *q11s16,
18 int16x8_t *q12s16, int16x8_t *q13s16,
19 int16x8_t *q14s16, int16x8_t *q15s16) {
20 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
21 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
22 int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
23 int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
25 d16s16 = vget_low_s16(*q8s16);
26 d17s16 = vget_high_s16(*q8s16);
27 d18s16 = vget_low_s16(*q9s16);
28 d19s16 = vget_high_s16(*q9s16);
29 d20s16 = vget_low_s16(*q10s16);
30 d21s16 = vget_high_s16(*q10s16);
31 d22s16 = vget_low_s16(*q11s16);
32 d23s16 = vget_high_s16(*q11s16);
33 d24s16 = vget_low_s16(*q12s16);
34 d25s16 = vget_high_s16(*q12s16);
35 d26s16 = vget_low_s16(*q13s16);
36 d27s16 = vget_high_s16(*q13s16);
37 d28s16 = vget_low_s16(*q14s16);
38 d29s16 = vget_high_s16(*q14s16);
39 d30s16 = vget_low_s16(*q15s16);
40 d31s16 = vget_high_s16(*q15s16);
42 *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
43 *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
44 *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
45 *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
46 *q12s16 = vcombine_s16(d17s16, d25s16);
47 *q13s16 = vcombine_s16(d19s16, d27s16);
48 *q14s16 = vcombine_s16(d21s16, d29s16);
49 *q15s16 = vcombine_s16(d23s16, d31s16);
52 vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16));
54 vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16));
56 vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16));
58 vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16));
60 q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
61 vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
62 q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
63 vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
64 q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
65 vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
66 q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
67 vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
69 *q8s16 = q0x2s16.val[0];
70 *q9s16 = q0x2s16.val[1];
71 *q10s16 = q1x2s16.val[0];
72 *q11s16 = q1x2s16.val[1];
73 *q12s16 = q2x2s16.val[0];
74 *q13s16 = q2x2s16.val[1];
75 *q14s16 = q3x2s16.val[0];
76 *q15s16 = q3x2s16.val[1];
80 void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out,
82 int16x4_t d0s16, d1s16, d2s16, d3s16;
83 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
84 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
85 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
86 uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
87 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
88 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
89 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
90 int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
91 int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
94 q0x2s16 = vld2q_s16(in);
95 q8s16 = q0x2s16.val[0];
97 q0x2s16 = vld2q_s16(in);
98 q9s16 = q0x2s16.val[0];
100 q0x2s16 = vld2q_s16(in);
101 q10s16 = q0x2s16.val[0];
103 q0x2s16 = vld2q_s16(in);
104 q11s16 = q0x2s16.val[0];
106 q0x2s16 = vld2q_s16(in);
107 q12s16 = q0x2s16.val[0];
109 q0x2s16 = vld2q_s16(in);
110 q13s16 = q0x2s16.val[0];
112 q0x2s16 = vld2q_s16(in);
113 q14s16 = q0x2s16.val[0];
115 q0x2s16 = vld2q_s16(in);
116 q15s16 = q0x2s16.val[0];
118 TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
121 d16s16 = vget_low_s16(q8s16);
122 d17s16 = vget_high_s16(q8s16);
123 d18s16 = vget_low_s16(q9s16);
124 d19s16 = vget_high_s16(q9s16);
125 d20s16 = vget_low_s16(q10s16);
126 d21s16 = vget_high_s16(q10s16);
127 d22s16 = vget_low_s16(q11s16);
128 d23s16 = vget_high_s16(q11s16);
129 d24s16 = vget_low_s16(q12s16);
130 d25s16 = vget_high_s16(q12s16);
131 d26s16 = vget_low_s16(q13s16);
132 d27s16 = vget_high_s16(q13s16);
133 d28s16 = vget_low_s16(q14s16);
134 d29s16 = vget_high_s16(q14s16);
135 d30s16 = vget_low_s16(q15s16);
136 d31s16 = vget_high_s16(q15s16);
139 d0s16 = vdup_n_s16(cospi_28_64);
140 d1s16 = vdup_n_s16(cospi_4_64);
142 q2s32 = vmull_s16(d18s16, d0s16);
143 q3s32 = vmull_s16(d19s16, d0s16);
144 q5s32 = vmull_s16(d18s16, d1s16);
145 q6s32 = vmull_s16(d19s16, d1s16);
147 q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
148 q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
149 q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
150 q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
152 d2s16 = vdup_n_s16(cospi_12_64);
153 d3s16 = vdup_n_s16(cospi_20_64);
155 d8s16 = vqrshrn_n_s32(q2s32, 14);
156 d9s16 = vqrshrn_n_s32(q3s32, 14);
157 d14s16 = vqrshrn_n_s32(q5s32, 14);
158 d15s16 = vqrshrn_n_s32(q6s32, 14);
159 q4s16 = vcombine_s16(d8s16, d9s16);
160 q7s16 = vcombine_s16(d14s16, d15s16);
162 q2s32 = vmull_s16(d26s16, d2s16);
163 q3s32 = vmull_s16(d27s16, d2s16);
164 q9s32 = vmull_s16(d26s16, d3s16);
165 q15s32 = vmull_s16(d27s16, d3s16);
167 q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
168 q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
169 q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
170 q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
172 d10s16 = vqrshrn_n_s32(q2s32, 14);
173 d11s16 = vqrshrn_n_s32(q3s32, 14);
174 d12s16 = vqrshrn_n_s32(q9s32, 14);
175 d13s16 = vqrshrn_n_s32(q15s32, 14);
176 q5s16 = vcombine_s16(d10s16, d11s16);
177 q6s16 = vcombine_s16(d12s16, d13s16);
180 d30s16 = vdup_n_s16(cospi_16_64);
182 q2s32 = vmull_s16(d16s16, d30s16);
183 q11s32 = vmull_s16(d17s16, d30s16);
184 q0s32 = vmull_s16(d24s16, d30s16);
185 q1s32 = vmull_s16(d25s16, d30s16);
187 d30s16 = vdup_n_s16(cospi_24_64);
188 d31s16 = vdup_n_s16(cospi_8_64);
190 q3s32 = vaddq_s32(q2s32, q0s32);
191 q12s32 = vaddq_s32(q11s32, q1s32);
192 q13s32 = vsubq_s32(q2s32, q0s32);
193 q1s32 = vsubq_s32(q11s32, q1s32);
195 d16s16 = vqrshrn_n_s32(q3s32, 14);
196 d17s16 = vqrshrn_n_s32(q12s32, 14);
197 d18s16 = vqrshrn_n_s32(q13s32, 14);
198 d19s16 = vqrshrn_n_s32(q1s32, 14);
199 q8s16 = vcombine_s16(d16s16, d17s16);
200 q9s16 = vcombine_s16(d18s16, d19s16);
202 q0s32 = vmull_s16(d20s16, d31s16);
203 q1s32 = vmull_s16(d21s16, d31s16);
204 q12s32 = vmull_s16(d20s16, d30s16);
205 q13s32 = vmull_s16(d21s16, d30s16);
207 q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
208 q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
209 q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
210 q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
212 d22s16 = vqrshrn_n_s32(q0s32, 14);
213 d23s16 = vqrshrn_n_s32(q1s32, 14);
214 d20s16 = vqrshrn_n_s32(q12s32, 14);
215 d21s16 = vqrshrn_n_s32(q13s32, 14);
216 q10s16 = vcombine_s16(d20s16, d21s16);
217 q11s16 = vcombine_s16(d22s16, d23s16);
219 q13s16 = vsubq_s16(q4s16, q5s16);
220 q4s16 = vaddq_s16(q4s16, q5s16);
221 q14s16 = vsubq_s16(q7s16, q6s16);
222 q15s16 = vaddq_s16(q6s16, q7s16);
223 d26s16 = vget_low_s16(q13s16);
224 d27s16 = vget_high_s16(q13s16);
225 d28s16 = vget_low_s16(q14s16);
226 d29s16 = vget_high_s16(q14s16);
229 q0s16 = vaddq_s16(q8s16, q11s16);
230 q1s16 = vaddq_s16(q9s16, q10s16);
231 q2s16 = vsubq_s16(q9s16, q10s16);
232 q3s16 = vsubq_s16(q8s16, q11s16);
234 d16s16 = vdup_n_s16(cospi_16_64);
236 q11s32 = vmull_s16(d26s16, d16s16);
237 q12s32 = vmull_s16(d27s16, d16s16);
238 q9s32 = vmull_s16(d28s16, d16s16);
239 q10s32 = vmull_s16(d29s16, d16s16);
241 q6s32 = vsubq_s32(q9s32, q11s32);
242 q13s32 = vsubq_s32(q10s32, q12s32);
243 q9s32 = vaddq_s32(q9s32, q11s32);
244 q10s32 = vaddq_s32(q10s32, q12s32);
246 d10s16 = vqrshrn_n_s32(q6s32, 14);
247 d11s16 = vqrshrn_n_s32(q13s32, 14);
248 d12s16 = vqrshrn_n_s32(q9s32, 14);
249 d13s16 = vqrshrn_n_s32(q10s32, 14);
250 q5s16 = vcombine_s16(d10s16, d11s16);
251 q6s16 = vcombine_s16(d12s16, d13s16);
254 q8s16 = vaddq_s16(q0s16, q15s16);
255 q9s16 = vaddq_s16(q1s16, q6s16);
256 q10s16 = vaddq_s16(q2s16, q5s16);
257 q11s16 = vaddq_s16(q3s16, q4s16);
258 q12s16 = vsubq_s16(q3s16, q4s16);
259 q13s16 = vsubq_s16(q2s16, q5s16);
260 q14s16 = vsubq_s16(q1s16, q6s16);
261 q15s16 = vsubq_s16(q0s16, q15s16);
263 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
264 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
265 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
266 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
267 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
268 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
269 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
270 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
271 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
272 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
273 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
274 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
275 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
276 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
277 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
278 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
281 output_stride >>= 1; // output_stride / 2, out is int16_t
282 vst1_u64((uint64_t *)out, d16u64);
283 out += output_stride;
284 vst1_u64((uint64_t *)out, d17u64);
285 out += output_stride;
286 vst1_u64((uint64_t *)out, d18u64);
287 out += output_stride;
288 vst1_u64((uint64_t *)out, d19u64);
289 out += output_stride;
290 vst1_u64((uint64_t *)out, d20u64);
291 out += output_stride;
292 vst1_u64((uint64_t *)out, d21u64);
293 out += output_stride;
294 vst1_u64((uint64_t *)out, d22u64);
295 out += output_stride;
296 vst1_u64((uint64_t *)out, d23u64);
297 out += output_stride;
298 vst1_u64((uint64_t *)out, d24u64);
299 out += output_stride;
300 vst1_u64((uint64_t *)out, d25u64);
301 out += output_stride;
302 vst1_u64((uint64_t *)out, d26u64);
303 out += output_stride;
304 vst1_u64((uint64_t *)out, d27u64);
305 out += output_stride;
306 vst1_u64((uint64_t *)out, d28u64);
307 out += output_stride;
308 vst1_u64((uint64_t *)out, d29u64);
309 out += output_stride;
310 vst1_u64((uint64_t *)out, d30u64);
311 out += output_stride;
312 vst1_u64((uint64_t *)out, d31u64);
316 void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out,
317 int16_t *pass1Output, int16_t skip_adding,
318 uint8_t *dest, int dest_stride) {
320 uint8x8_t d12u8, d13u8;
321 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
322 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
323 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
324 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
325 uint64x1_t d24u64, d25u64, d26u64, d27u64;
326 int64x1_t d12s64, d13s64;
327 uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
328 uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
329 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
330 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
331 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
332 int32x4_t q10s32, q11s32, q12s32, q13s32;
335 q0x2s16 = vld2q_s16(src);
336 q8s16 = q0x2s16.val[0];
338 q0x2s16 = vld2q_s16(src);
339 q9s16 = q0x2s16.val[0];
341 q0x2s16 = vld2q_s16(src);
342 q10s16 = q0x2s16.val[0];
344 q0x2s16 = vld2q_s16(src);
345 q11s16 = q0x2s16.val[0];
347 q0x2s16 = vld2q_s16(src);
348 q12s16 = q0x2s16.val[0];
350 q0x2s16 = vld2q_s16(src);
351 q13s16 = q0x2s16.val[0];
353 q0x2s16 = vld2q_s16(src);
354 q14s16 = q0x2s16.val[0];
356 q0x2s16 = vld2q_s16(src);
357 q15s16 = q0x2s16.val[0];
359 TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
362 d16s16 = vget_low_s16(q8s16);
363 d17s16 = vget_high_s16(q8s16);
364 d18s16 = vget_low_s16(q9s16);
365 d19s16 = vget_high_s16(q9s16);
366 d20s16 = vget_low_s16(q10s16);
367 d21s16 = vget_high_s16(q10s16);
368 d22s16 = vget_low_s16(q11s16);
369 d23s16 = vget_high_s16(q11s16);
370 d24s16 = vget_low_s16(q12s16);
371 d25s16 = vget_high_s16(q12s16);
372 d26s16 = vget_low_s16(q13s16);
373 d27s16 = vget_high_s16(q13s16);
374 d28s16 = vget_low_s16(q14s16);
375 d29s16 = vget_high_s16(q14s16);
376 d30s16 = vget_low_s16(q15s16);
377 d31s16 = vget_high_s16(q15s16);
380 d12s16 = vdup_n_s16(cospi_30_64);
381 d13s16 = vdup_n_s16(cospi_2_64);
383 q2s32 = vmull_s16(d16s16, d12s16);
384 q3s32 = vmull_s16(d17s16, d12s16);
385 q1s32 = vmull_s16(d16s16, d13s16);
386 q4s32 = vmull_s16(d17s16, d13s16);
388 q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
389 q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
390 q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
391 q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
393 d0s16 = vqrshrn_n_s32(q2s32, 14);
394 d1s16 = vqrshrn_n_s32(q3s32, 14);
395 d14s16 = vqrshrn_n_s32(q1s32, 14);
396 d15s16 = vqrshrn_n_s32(q4s32, 14);
397 q0s16 = vcombine_s16(d0s16, d1s16);
398 q7s16 = vcombine_s16(d14s16, d15s16);
400 d30s16 = vdup_n_s16(cospi_14_64);
401 d31s16 = vdup_n_s16(cospi_18_64);
403 q2s32 = vmull_s16(d24s16, d30s16);
404 q3s32 = vmull_s16(d25s16, d30s16);
405 q4s32 = vmull_s16(d24s16, d31s16);
406 q5s32 = vmull_s16(d25s16, d31s16);
408 q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
409 q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
410 q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
411 q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
413 d2s16 = vqrshrn_n_s32(q2s32, 14);
414 d3s16 = vqrshrn_n_s32(q3s32, 14);
415 d12s16 = vqrshrn_n_s32(q4s32, 14);
416 d13s16 = vqrshrn_n_s32(q5s32, 14);
417 q1s16 = vcombine_s16(d2s16, d3s16);
418 q6s16 = vcombine_s16(d12s16, d13s16);
420 d30s16 = vdup_n_s16(cospi_22_64);
421 d31s16 = vdup_n_s16(cospi_10_64);
423 q11s32 = vmull_s16(d20s16, d30s16);
424 q12s32 = vmull_s16(d21s16, d30s16);
425 q4s32 = vmull_s16(d20s16, d31s16);
426 q5s32 = vmull_s16(d21s16, d31s16);
428 q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
429 q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
430 q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
431 q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
433 d4s16 = vqrshrn_n_s32(q11s32, 14);
434 d5s16 = vqrshrn_n_s32(q12s32, 14);
435 d11s16 = vqrshrn_n_s32(q5s32, 14);
436 d10s16 = vqrshrn_n_s32(q4s32, 14);
437 q2s16 = vcombine_s16(d4s16, d5s16);
438 q5s16 = vcombine_s16(d10s16, d11s16);
440 d30s16 = vdup_n_s16(cospi_6_64);
441 d31s16 = vdup_n_s16(cospi_26_64);
443 q10s32 = vmull_s16(d28s16, d30s16);
444 q11s32 = vmull_s16(d29s16, d30s16);
445 q12s32 = vmull_s16(d28s16, d31s16);
446 q13s32 = vmull_s16(d29s16, d31s16);
448 q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
449 q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
450 q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
451 q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
453 d6s16 = vqrshrn_n_s32(q10s32, 14);
454 d7s16 = vqrshrn_n_s32(q11s32, 14);
455 d8s16 = vqrshrn_n_s32(q12s32, 14);
456 d9s16 = vqrshrn_n_s32(q13s32, 14);
457 q3s16 = vcombine_s16(d6s16, d7s16);
458 q4s16 = vcombine_s16(d8s16, d9s16);
461 q9s16 = vsubq_s16(q0s16, q1s16);
462 q0s16 = vaddq_s16(q0s16, q1s16);
463 q10s16 = vsubq_s16(q3s16, q2s16);
464 q11s16 = vaddq_s16(q2s16, q3s16);
465 q12s16 = vaddq_s16(q4s16, q5s16);
466 q13s16 = vsubq_s16(q4s16, q5s16);
467 q14s16 = vsubq_s16(q7s16, q6s16);
468 q7s16 = vaddq_s16(q6s16, q7s16);
471 d18s16 = vget_low_s16(q9s16);
472 d19s16 = vget_high_s16(q9s16);
473 d20s16 = vget_low_s16(q10s16);
474 d21s16 = vget_high_s16(q10s16);
475 d26s16 = vget_low_s16(q13s16);
476 d27s16 = vget_high_s16(q13s16);
477 d28s16 = vget_low_s16(q14s16);
478 d29s16 = vget_high_s16(q14s16);
480 d30s16 = vdup_n_s16(cospi_8_64);
481 d31s16 = vdup_n_s16(cospi_24_64);
483 q2s32 = vmull_s16(d18s16, d31s16);
484 q3s32 = vmull_s16(d19s16, d31s16);
485 q4s32 = vmull_s16(d28s16, d31s16);
486 q5s32 = vmull_s16(d29s16, d31s16);
488 q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
489 q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
490 q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
491 q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
493 d12s16 = vqrshrn_n_s32(q2s32, 14);
494 d13s16 = vqrshrn_n_s32(q3s32, 14);
495 d2s16 = vqrshrn_n_s32(q4s32, 14);
496 d3s16 = vqrshrn_n_s32(q5s32, 14);
497 q1s16 = vcombine_s16(d2s16, d3s16);
498 q6s16 = vcombine_s16(d12s16, d13s16);
503 d30s16 = vdup_n_s16(-cospi_8_64);
504 q11s32 = vmull_s16(d26s16, d30s16);
505 q12s32 = vmull_s16(d27s16, d30s16);
506 q8s32 = vmull_s16(d20s16, d30s16);
507 q9s32 = vmull_s16(d21s16, d30s16);
509 q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
510 q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
511 q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
512 q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
514 d4s16 = vqrshrn_n_s32(q11s32, 14);
515 d5s16 = vqrshrn_n_s32(q12s32, 14);
516 d10s16 = vqrshrn_n_s32(q8s32, 14);
517 d11s16 = vqrshrn_n_s32(q9s32, 14);
518 q2s16 = vcombine_s16(d4s16, d5s16);
519 q5s16 = vcombine_s16(d10s16, d11s16);
522 q8s16 = vaddq_s16(q0s16, q3s16);
523 q9s16 = vaddq_s16(q1s16, q2s16);
524 q10s16 = vsubq_s16(q1s16, q2s16);
525 q11s16 = vsubq_s16(q0s16, q3s16);
526 q12s16 = vsubq_s16(q7s16, q4s16);
527 q13s16 = vsubq_s16(q6s16, q5s16);
528 q14s16 = vaddq_s16(q6s16, q5s16);
529 q15s16 = vaddq_s16(q7s16, q4s16);
532 d20s16 = vget_low_s16(q10s16);
533 d21s16 = vget_high_s16(q10s16);
534 d22s16 = vget_low_s16(q11s16);
535 d23s16 = vget_high_s16(q11s16);
536 d24s16 = vget_low_s16(q12s16);
537 d25s16 = vget_high_s16(q12s16);
538 d26s16 = vget_low_s16(q13s16);
539 d27s16 = vget_high_s16(q13s16);
541 d14s16 = vdup_n_s16(cospi_16_64);
543 q3s32 = vmull_s16(d26s16, d14s16);
544 q4s32 = vmull_s16(d27s16, d14s16);
545 q0s32 = vmull_s16(d20s16, d14s16);
546 q1s32 = vmull_s16(d21s16, d14s16);
548 q5s32 = vsubq_s32(q3s32, q0s32);
549 q6s32 = vsubq_s32(q4s32, q1s32);
550 q10s32 = vaddq_s32(q3s32, q0s32);
551 q4s32 = vaddq_s32(q4s32, q1s32);
553 d4s16 = vqrshrn_n_s32(q5s32, 14);
554 d5s16 = vqrshrn_n_s32(q6s32, 14);
555 d10s16 = vqrshrn_n_s32(q10s32, 14);
556 d11s16 = vqrshrn_n_s32(q4s32, 14);
557 q2s16 = vcombine_s16(d4s16, d5s16);
558 q5s16 = vcombine_s16(d10s16, d11s16);
560 q0s32 = vmull_s16(d22s16, d14s16);
561 q1s32 = vmull_s16(d23s16, d14s16);
562 q13s32 = vmull_s16(d24s16, d14s16);
563 q6s32 = vmull_s16(d25s16, d14s16);
565 q10s32 = vsubq_s32(q13s32, q0s32);
566 q4s32 = vsubq_s32(q6s32, q1s32);
567 q13s32 = vaddq_s32(q13s32, q0s32);
568 q6s32 = vaddq_s32(q6s32, q1s32);
570 d6s16 = vqrshrn_n_s32(q10s32, 14);
571 d7s16 = vqrshrn_n_s32(q4s32, 14);
572 d8s16 = vqrshrn_n_s32(q13s32, 14);
573 d9s16 = vqrshrn_n_s32(q6s32, 14);
574 q3s16 = vcombine_s16(d6s16, d7s16);
575 q4s16 = vcombine_s16(d8s16, d9s16);
578 if (skip_adding != 0) {
580 // load the data in pass1
581 q0s16 = vld1q_s16(pass1Output);
583 q1s16 = vld1q_s16(pass1Output);
585 d12s64 = vld1_s64((int64_t *)dest);
587 d13s64 = vld1_s64((int64_t *)dest);
590 q12s16 = vaddq_s16(q0s16, q15s16);
591 q13s16 = vaddq_s16(q1s16, q14s16);
592 q12s16 = vrshrq_n_s16(q12s16, 6);
593 q13s16 = vrshrq_n_s16(q13s16, 6);
595 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
597 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
598 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
599 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
600 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
602 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
604 q14s16 = vsubq_s16(q1s16, q14s16);
605 q15s16 = vsubq_s16(q0s16, q15s16);
607 q10s16 = vld1q_s16(pass1Output);
609 q11s16 = vld1q_s16(pass1Output);
611 d12s64 = vld1_s64((int64_t *)dest);
613 d13s64 = vld1_s64((int64_t *)dest);
615 q12s16 = vaddq_s16(q10s16, q5s16);
616 q13s16 = vaddq_s16(q11s16, q4s16);
617 q12s16 = vrshrq_n_s16(q12s16, 6);
618 q13s16 = vrshrq_n_s16(q13s16, 6);
620 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
622 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
623 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
624 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
625 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
627 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
629 q4s16 = vsubq_s16(q11s16, q4s16);
630 q5s16 = vsubq_s16(q10s16, q5s16);
632 q0s16 = vld1q_s16(pass1Output);
634 q1s16 = vld1q_s16(pass1Output);
636 d12s64 = vld1_s64((int64_t *)dest);
638 d13s64 = vld1_s64((int64_t *)dest);
640 q12s16 = vaddq_s16(q0s16, q3s16);
641 q13s16 = vaddq_s16(q1s16, q2s16);
642 q12s16 = vrshrq_n_s16(q12s16, 6);
643 q13s16 = vrshrq_n_s16(q13s16, 6);
645 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
647 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
648 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
649 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
650 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
652 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
654 q2s16 = vsubq_s16(q1s16, q2s16);
655 q3s16 = vsubq_s16(q0s16, q3s16);
657 q10s16 = vld1q_s16(pass1Output);
659 q11s16 = vld1q_s16(pass1Output);
660 d12s64 = vld1_s64((int64_t *)dest);
662 d13s64 = vld1_s64((int64_t *)dest);
664 q12s16 = vaddq_s16(q10s16, q9s16);
665 q13s16 = vaddq_s16(q11s16, q8s16);
666 q12s16 = vrshrq_n_s16(q12s16, 6);
667 q13s16 = vrshrq_n_s16(q13s16, 6);
669 vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64));
671 vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64));
672 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
673 d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
674 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
676 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
678 q8s16 = vsubq_s16(q11s16, q8s16);
679 q9s16 = vsubq_s16(q10s16, q9s16);
681 // store the data out 8,9,10,11,12,13,14,15
682 d12s64 = vld1_s64((int64_t *)dest);
684 q8s16 = vrshrq_n_s16(q8s16, 6);
685 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64));
686 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
687 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
690 d12s64 = vld1_s64((int64_t *)dest);
692 q9s16 = vrshrq_n_s16(q9s16, 6);
693 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64));
694 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
695 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
698 d12s64 = vld1_s64((int64_t *)dest);
700 q2s16 = vrshrq_n_s16(q2s16, 6);
701 q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64));
702 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
703 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
706 d12s64 = vld1_s64((int64_t *)dest);
708 q3s16 = vrshrq_n_s16(q3s16, 6);
709 q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64));
710 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
711 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
714 d12s64 = vld1_s64((int64_t *)dest);
716 q4s16 = vrshrq_n_s16(q4s16, 6);
717 q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64));
718 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
719 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
722 d12s64 = vld1_s64((int64_t *)dest);
724 q5s16 = vrshrq_n_s16(q5s16, 6);
725 q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64));
726 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
727 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
730 d12s64 = vld1_s64((int64_t *)dest);
732 q14s16 = vrshrq_n_s16(q14s16, 6);
734 vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64));
735 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
736 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
739 d12s64 = vld1_s64((int64_t *)dest);
740 q15s16 = vrshrq_n_s16(q15s16, 6);
742 vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64));
743 d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
744 vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
745 } else { // skip_adding_dest
746 q0s16 = vld1q_s16(pass1Output);
748 q1s16 = vld1q_s16(pass1Output);
750 q12s16 = vaddq_s16(q0s16, q15s16);
751 q13s16 = vaddq_s16(q1s16, q14s16);
752 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
753 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
754 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
755 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
756 vst1_u64((uint64_t *)out, d24u64);
758 vst1_u64((uint64_t *)out, d25u64);
760 vst1_u64((uint64_t *)out, d26u64);
762 vst1_u64((uint64_t *)out, d27u64);
764 q14s16 = vsubq_s16(q1s16, q14s16);
765 q15s16 = vsubq_s16(q0s16, q15s16);
767 q10s16 = vld1q_s16(pass1Output);
769 q11s16 = vld1q_s16(pass1Output);
771 q12s16 = vaddq_s16(q10s16, q5s16);
772 q13s16 = vaddq_s16(q11s16, q4s16);
773 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
774 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
775 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
776 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
777 vst1_u64((uint64_t *)out, d24u64);
779 vst1_u64((uint64_t *)out, d25u64);
781 vst1_u64((uint64_t *)out, d26u64);
783 vst1_u64((uint64_t *)out, d27u64);
785 q4s16 = vsubq_s16(q11s16, q4s16);
786 q5s16 = vsubq_s16(q10s16, q5s16);
788 q0s16 = vld1q_s16(pass1Output);
790 q1s16 = vld1q_s16(pass1Output);
792 q12s16 = vaddq_s16(q0s16, q3s16);
793 q13s16 = vaddq_s16(q1s16, q2s16);
794 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
795 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
796 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
797 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
798 vst1_u64((uint64_t *)out, d24u64);
800 vst1_u64((uint64_t *)out, d25u64);
802 vst1_u64((uint64_t *)out, d26u64);
804 vst1_u64((uint64_t *)out, d27u64);
806 q2s16 = vsubq_s16(q1s16, q2s16);
807 q3s16 = vsubq_s16(q0s16, q3s16);
809 q10s16 = vld1q_s16(pass1Output);
811 q11s16 = vld1q_s16(pass1Output);
813 q12s16 = vaddq_s16(q10s16, q9s16);
814 q13s16 = vaddq_s16(q11s16, q8s16);
815 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
816 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
817 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
818 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
819 vst1_u64((uint64_t *)out, d24u64);
821 vst1_u64((uint64_t *)out, d25u64);
823 vst1_u64((uint64_t *)out, d26u64);
825 vst1_u64((uint64_t *)out, d27u64);
827 q8s16 = vsubq_s16(q11s16, q8s16);
828 q9s16 = vsubq_s16(q10s16, q9s16);
830 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
832 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
834 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
836 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
838 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
840 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
842 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
844 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
846 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
848 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
850 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
852 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
854 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
856 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
858 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
860 vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
865 void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out,
868 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
869 uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
870 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
871 int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
872 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
873 int32x4_t q6s32, q9s32;
874 int32x4_t q10s32, q11s32, q12s32, q15s32;
877 q0x2s16 = vld2q_s16(in);
878 q8s16 = q0x2s16.val[0];
880 q0x2s16 = vld2q_s16(in);
881 q9s16 = q0x2s16.val[0];
883 q0x2s16 = vld2q_s16(in);
884 q10s16 = q0x2s16.val[0];
886 q0x2s16 = vld2q_s16(in);
887 q11s16 = q0x2s16.val[0];
889 q0x2s16 = vld2q_s16(in);
890 q12s16 = q0x2s16.val[0];
892 q0x2s16 = vld2q_s16(in);
893 q13s16 = q0x2s16.val[0];
895 q0x2s16 = vld2q_s16(in);
896 q14s16 = q0x2s16.val[0];
898 q0x2s16 = vld2q_s16(in);
899 q15s16 = q0x2s16.val[0];
901 TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
905 q0s16 = vdupq_n_s16(cospi_28_64 * 2);
906 q1s16 = vdupq_n_s16(cospi_4_64 * 2);
908 q4s16 = vqrdmulhq_s16(q9s16, q0s16);
909 q7s16 = vqrdmulhq_s16(q9s16, q1s16);
912 q1s16 = vdupq_n_s16(cospi_16_64 * 2);
913 d4s16 = vdup_n_s16(cospi_16_64);
915 q8s16 = vqrdmulhq_s16(q8s16, q1s16);
917 d8s16 = vget_low_s16(q4s16);
918 d9s16 = vget_high_s16(q4s16);
919 d14s16 = vget_low_s16(q7s16);
920 d15s16 = vget_high_s16(q7s16);
921 q9s32 = vmull_s16(d14s16, d4s16);
922 q10s32 = vmull_s16(d15s16, d4s16);
923 q12s32 = vmull_s16(d9s16, d4s16);
924 q11s32 = vmull_s16(d8s16, d4s16);
926 q15s32 = vsubq_s32(q10s32, q12s32);
927 q6s32 = vsubq_s32(q9s32, q11s32);
928 q9s32 = vaddq_s32(q9s32, q11s32);
929 q10s32 = vaddq_s32(q10s32, q12s32);
931 d11s16 = vqrshrn_n_s32(q15s32, 14);
932 d10s16 = vqrshrn_n_s32(q6s32, 14);
933 d12s16 = vqrshrn_n_s32(q9s32, 14);
934 d13s16 = vqrshrn_n_s32(q10s32, 14);
935 q5s16 = vcombine_s16(d10s16, d11s16);
936 q6s16 = vcombine_s16(d12s16, d13s16);
939 q2s16 = vaddq_s16(q8s16, q7s16);
940 q9s16 = vaddq_s16(q8s16, q6s16);
941 q10s16 = vaddq_s16(q8s16, q5s16);
942 q11s16 = vaddq_s16(q8s16, q4s16);
943 q12s16 = vsubq_s16(q8s16, q4s16);
944 q13s16 = vsubq_s16(q8s16, q5s16);
945 q14s16 = vsubq_s16(q8s16, q6s16);
946 q15s16 = vsubq_s16(q8s16, q7s16);
948 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
949 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
950 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
951 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
952 d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
953 d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
954 d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
955 d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
956 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
957 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
958 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
959 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
960 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
961 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
962 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
963 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
966 output_stride >>= 1; // output_stride / 2, out is int16_t
967 vst1_u64((uint64_t *)out, d4u64);
968 out += output_stride;
969 vst1_u64((uint64_t *)out, d5u64);
970 out += output_stride;
971 vst1_u64((uint64_t *)out, d18u64);
972 out += output_stride;
973 vst1_u64((uint64_t *)out, d19u64);
974 out += output_stride;
975 vst1_u64((uint64_t *)out, d20u64);
976 out += output_stride;
977 vst1_u64((uint64_t *)out, d21u64);
978 out += output_stride;
979 vst1_u64((uint64_t *)out, d22u64);
980 out += output_stride;
981 vst1_u64((uint64_t *)out, d23u64);
982 out += output_stride;
983 vst1_u64((uint64_t *)out, d24u64);
984 out += output_stride;
985 vst1_u64((uint64_t *)out, d25u64);
986 out += output_stride;
987 vst1_u64((uint64_t *)out, d26u64);
988 out += output_stride;
989 vst1_u64((uint64_t *)out, d27u64);
990 out += output_stride;
991 vst1_u64((uint64_t *)out, d28u64);
992 out += output_stride;
993 vst1_u64((uint64_t *)out, d29u64);
994 out += output_stride;
995 vst1_u64((uint64_t *)out, d30u64);
996 out += output_stride;
997 vst1_u64((uint64_t *)out, d31u64);
1001 void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out,
1002 int16_t *pass1Output, int16_t skip_adding,
1003 uint8_t *dest, int dest_stride) {
1004 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
1005 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
1006 int16x4_t d20s16, d21s16, d22s16, d23s16;
1007 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
1008 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
1009 uint64x1_t d16u64, d17u64, d18u64, d19u64;
1010 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
1011 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
1012 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
1013 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
1014 int32x4_t q10s32, q11s32, q12s32, q13s32;
1015 int16x8x2_t q0x2s16;
1020 q0x2s16 = vld2q_s16(src);
1021 q8s16 = q0x2s16.val[0];
1023 q0x2s16 = vld2q_s16(src);
1024 q9s16 = q0x2s16.val[0];
1026 q0x2s16 = vld2q_s16(src);
1027 q10s16 = q0x2s16.val[0];
1029 q0x2s16 = vld2q_s16(src);
1030 q11s16 = q0x2s16.val[0];
1032 q0x2s16 = vld2q_s16(src);
1033 q12s16 = q0x2s16.val[0];
1035 q0x2s16 = vld2q_s16(src);
1036 q13s16 = q0x2s16.val[0];
1038 q0x2s16 = vld2q_s16(src);
1039 q14s16 = q0x2s16.val[0];
1041 q0x2s16 = vld2q_s16(src);
1042 q15s16 = q0x2s16.val[0];
1044 TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
1048 q6s16 = vdupq_n_s16(cospi_30_64 * 2);
1049 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
1050 q6s16 = vdupq_n_s16(cospi_2_64 * 2);
1051 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
1053 q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
1054 q14s16 = vdupq_n_s16(cospi_6_64 * 2);
1055 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
1056 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
1059 d0s16 = vget_low_s16(q0s16);
1060 d1s16 = vget_high_s16(q0s16);
1061 d6s16 = vget_low_s16(q3s16);
1062 d7s16 = vget_high_s16(q3s16);
1063 d8s16 = vget_low_s16(q4s16);
1064 d9s16 = vget_high_s16(q4s16);
1065 d14s16 = vget_low_s16(q7s16);
1066 d15s16 = vget_high_s16(q7s16);
1068 d30s16 = vdup_n_s16(cospi_8_64);
1069 d31s16 = vdup_n_s16(cospi_24_64);
1071 q12s32 = vmull_s16(d14s16, d31s16);
1072 q5s32 = vmull_s16(d15s16, d31s16);
1073 q2s32 = vmull_s16(d0s16, d31s16);
1074 q11s32 = vmull_s16(d1s16, d31s16);
1076 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
1077 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
1078 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
1079 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
1081 d2s16 = vqrshrn_n_s32(q12s32, 14);
1082 d3s16 = vqrshrn_n_s32(q5s32, 14);
1083 d12s16 = vqrshrn_n_s32(q2s32, 14);
1084 d13s16 = vqrshrn_n_s32(q11s32, 14);
1085 q1s16 = vcombine_s16(d2s16, d3s16);
1086 q6s16 = vcombine_s16(d12s16, d13s16);
1088 d30s16 = vdup_n_s16(-cospi_8_64);
1089 q10s32 = vmull_s16(d8s16, d30s16);
1090 q13s32 = vmull_s16(d9s16, d30s16);
1091 q8s32 = vmull_s16(d6s16, d30s16);
1092 q9s32 = vmull_s16(d7s16, d30s16);
1094 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
1095 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
1096 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
1097 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
1099 d4s16 = vqrshrn_n_s32(q10s32, 14);
1100 d5s16 = vqrshrn_n_s32(q13s32, 14);
1101 d10s16 = vqrshrn_n_s32(q8s32, 14);
1102 d11s16 = vqrshrn_n_s32(q9s32, 14);
1103 q2s16 = vcombine_s16(d4s16, d5s16);
1104 q5s16 = vcombine_s16(d10s16, d11s16);
1107 q8s16 = vaddq_s16(q0s16, q3s16);
1108 q9s16 = vaddq_s16(q1s16, q2s16);
1109 q10s16 = vsubq_s16(q1s16, q2s16);
1110 q11s16 = vsubq_s16(q0s16, q3s16);
1111 q12s16 = vsubq_s16(q7s16, q4s16);
1112 q13s16 = vsubq_s16(q6s16, q5s16);
1113 q14s16 = vaddq_s16(q6s16, q5s16);
1114 q15s16 = vaddq_s16(q7s16, q4s16);
1117 d20s16 = vget_low_s16(q10s16);
1118 d21s16 = vget_high_s16(q10s16);
1119 d22s16 = vget_low_s16(q11s16);
1120 d23s16 = vget_high_s16(q11s16);
1121 d24s16 = vget_low_s16(q12s16);
1122 d25s16 = vget_high_s16(q12s16);
1123 d26s16 = vget_low_s16(q13s16);
1124 d27s16 = vget_high_s16(q13s16);
1126 d14s16 = vdup_n_s16(cospi_16_64);
1127 q3s32 = vmull_s16(d26s16, d14s16);
1128 q4s32 = vmull_s16(d27s16, d14s16);
1129 q0s32 = vmull_s16(d20s16, d14s16);
1130 q1s32 = vmull_s16(d21s16, d14s16);
1132 q5s32 = vsubq_s32(q3s32, q0s32);
1133 q6s32 = vsubq_s32(q4s32, q1s32);
1134 q0s32 = vaddq_s32(q3s32, q0s32);
1135 q4s32 = vaddq_s32(q4s32, q1s32);
1137 d4s16 = vqrshrn_n_s32(q5s32, 14);
1138 d5s16 = vqrshrn_n_s32(q6s32, 14);
1139 d10s16 = vqrshrn_n_s32(q0s32, 14);
1140 d11s16 = vqrshrn_n_s32(q4s32, 14);
1141 q2s16 = vcombine_s16(d4s16, d5s16);
1142 q5s16 = vcombine_s16(d10s16, d11s16);
1144 q0s32 = vmull_s16(d22s16, d14s16);
1145 q1s32 = vmull_s16(d23s16, d14s16);
1146 q13s32 = vmull_s16(d24s16, d14s16);
1147 q6s32 = vmull_s16(d25s16, d14s16);
1149 q10s32 = vsubq_s32(q13s32, q0s32);
1150 q4s32 = vsubq_s32(q6s32, q1s32);
1151 q13s32 = vaddq_s32(q13s32, q0s32);
1152 q6s32 = vaddq_s32(q6s32, q1s32);
1154 d6s16 = vqrshrn_n_s32(q10s32, 14);
1155 d7s16 = vqrshrn_n_s32(q4s32, 14);
1156 d8s16 = vqrshrn_n_s32(q13s32, 14);
1157 d9s16 = vqrshrn_n_s32(q6s32, 14);
1158 q3s16 = vcombine_s16(d6s16, d7s16);
1159 q4s16 = vcombine_s16(d8s16, d9s16);
1162 q0s16 = vld1q_s16(pass1Output);
1164 q1s16 = vld1q_s16(pass1Output);
1166 q12s16 = vaddq_s16(q0s16, q15s16);
1167 q13s16 = vaddq_s16(q1s16, q14s16);
1168 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1169 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1170 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1171 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1172 vst1_u64((uint64_t *)out, d24u64);
1174 vst1_u64((uint64_t *)out, d25u64);
1176 vst1_u64((uint64_t *)out, d26u64);
1178 vst1_u64((uint64_t *)out, d27u64);
1180 q14s16 = vsubq_s16(q1s16, q14s16);
1181 q15s16 = vsubq_s16(q0s16, q15s16);
1183 q10s16 = vld1q_s16(pass1Output);
1185 q11s16 = vld1q_s16(pass1Output);
1187 q12s16 = vaddq_s16(q10s16, q5s16);
1188 q13s16 = vaddq_s16(q11s16, q4s16);
1189 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1190 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1191 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1192 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1193 vst1_u64((uint64_t *)out, d24u64);
1195 vst1_u64((uint64_t *)out, d25u64);
1197 vst1_u64((uint64_t *)out, d26u64);
1199 vst1_u64((uint64_t *)out, d27u64);
1201 q4s16 = vsubq_s16(q11s16, q4s16);
1202 q5s16 = vsubq_s16(q10s16, q5s16);
1204 q0s16 = vld1q_s16(pass1Output);
1206 q1s16 = vld1q_s16(pass1Output);
1208 q12s16 = vaddq_s16(q0s16, q3s16);
1209 q13s16 = vaddq_s16(q1s16, q2s16);
1210 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1211 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1212 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1213 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1214 vst1_u64((uint64_t *)out, d24u64);
1216 vst1_u64((uint64_t *)out, d25u64);
1218 vst1_u64((uint64_t *)out, d26u64);
1220 vst1_u64((uint64_t *)out, d27u64);
1222 q2s16 = vsubq_s16(q1s16, q2s16);
1223 q3s16 = vsubq_s16(q0s16, q3s16);
1225 q10s16 = vld1q_s16(pass1Output);
1227 q11s16 = vld1q_s16(pass1Output);
1228 q12s16 = vaddq_s16(q10s16, q9s16);
1229 q13s16 = vaddq_s16(q11s16, q8s16);
1230 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1231 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1232 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1233 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1234 vst1_u64((uint64_t *)out, d24u64);
1236 vst1_u64((uint64_t *)out, d25u64);
1238 vst1_u64((uint64_t *)out, d26u64);
1240 vst1_u64((uint64_t *)out, d27u64);
1242 q8s16 = vsubq_s16(q11s16, q8s16);
1243 q9s16 = vsubq_s16(q10s16, q9s16);
1245 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1246 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1247 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1248 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1249 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1250 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1251 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1252 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1253 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1254 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1255 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1256 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1257 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1258 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1259 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1260 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1262 vst1_u64((uint64_t *)out, d16u64);
1264 vst1_u64((uint64_t *)out, d17u64);
1266 vst1_u64((uint64_t *)out, d18u64);
1268 vst1_u64((uint64_t *)out, d19u64);
1270 vst1_u64((uint64_t *)out, d4u64);
1272 vst1_u64((uint64_t *)out, d5u64);
1274 vst1_u64((uint64_t *)out, d6u64);
1276 vst1_u64((uint64_t *)out, d7u64);
1278 vst1_u64((uint64_t *)out, d8u64);
1280 vst1_u64((uint64_t *)out, d9u64);
1282 vst1_u64((uint64_t *)out, d10u64);
1284 vst1_u64((uint64_t *)out, d11u64);
1286 vst1_u64((uint64_t *)out, d28u64);
1288 vst1_u64((uint64_t *)out, d29u64);
1290 vst1_u64((uint64_t *)out, d30u64);
1292 vst1_u64((uint64_t *)out, d31u64);