/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
// Transposes an 8x8 block of int16 coefficients in place.  Each of
// *q8s16..*q15s16 holds one row of eight values on entry and one column
// on exit.  The sequence mirrors the original ARM assembly: a 64-bit
// half swap (the "vswp" comments), then 32-bit and 16-bit interleaved
// transposes (vtrnq_s32 / vtrnq_s16) complete the permutation.
static INLINE void TRANSPOSE8X8(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  // Swap the high halves of rows 0-3 with the low halves of rows 4-7.
  *q8s16 = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
  *q9s16 = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
  *q12s16 = vcombine_s16(d17s16, d25s16);
  *q13s16 = vcombine_s16(d19s16, d27s16);
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

  // 32-bit transpose of 2x2 sub-blocks.
  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                      vreinterpretq_s32_s16(*q10s16));
  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                      vreinterpretq_s32_s16(*q11s16));
  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                      vreinterpretq_s32_s16(*q14s16));
  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                      vreinterpretq_s32_s16(*q15s16));

  // 16-bit transpose finishes the 8x8 permutation.
  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

  *q8s16 = q0x2s16.val[0];
  *q9s16 = q0x2s16.val[1];
  *q10s16 = q1x2s16.val[0];
  *q11s16 = q1x2s16.val[1];
  *q12s16 = q2x2s16.val[0];
  *q13s16 = q2x2s16.val[1];
  *q14s16 = q3x2s16.val[0];
  *q15s16 = q3x2s16.val[1];
  return;
}
// First pass of the full (256-coefficient) 16x16 inverse DCT.
// Loads eight 16-wide rows from |in|, keeping only the even-indexed
// elements of each row (vld2q_s16 de-interleaves; val[0] is kept),
// transposes the resulting 8x8 block, runs the even half of the
// 1-D idct16 butterfly, and stores the eight result rows to |out|.
//
//   in            : source coefficients, consumed 16 int16 per row
//   out           : intermediate output, 8 int16 stored per half-row
//   output_stride : stride of |out| in BYTES (halved below for int16)
void vpx_idct16x16_256_add_neon_pass1(
    int16_t *in,
    int16_t *out,
    int output_stride) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
  int16x8x2_t q0x2s16;

  // Load eight rows; each vld2q_s16 consumes a full 16-element row and
  // keeps the even-indexed elements in val[0].
  q0x2s16 = vld2q_s16(in);
  q8s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q9s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q10s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q11s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q12s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q13s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q14s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // stage 3: butterflies with cospi_28/4 and cospi_12/20 constants;
  // each product is rounded back to 16 bits with vqrshrn_n_s32(x, 14)
  // (DCT_CONST_ROUNDING).
  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d18s16, d1s16);
  q6s32 = vmull_s16(d19s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
  q6s32 = vmlal_s16(q6s32, d31s16, d0s16);

  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q5s32, 14);
  d15s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  q2s32 = vmull_s16(d26s16, d2s16);
  q3s32 = vmull_s16(d27s16, d2s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q15s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q15s32 = vmlal_s16(q15s32, d23s16, d2s16);

  d10s16 = vqrshrn_n_s32(q2s32, 14);
  d11s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q15s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4: even half with cospi_16 and cospi_24/8 constants.
  d30s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d30s16);
  q11s32 = vmull_s16(d17s16, d30s16);
  q0s32 = vmull_s16(d24s16, d30s16);
  q1s32 = vmull_s16(d25s16, d30s16);

  d30s16 = vdup_n_s16(cospi_24_64);
  d31s16 = vdup_n_s16(cospi_8_64);

  q3s32 = vaddq_s32(q2s32, q0s32);
  q12s32 = vaddq_s32(q11s32, q1s32);
  q13s32 = vsubq_s32(q2s32, q0s32);
  q1s32 = vsubq_s32(q11s32, q1s32);

  d16s16 = vqrshrn_n_s32(q3s32, 14);
  d17s16 = vqrshrn_n_s32(q12s32, 14);
  d18s16 = vqrshrn_n_s32(q13s32, 14);
  d19s16 = vqrshrn_n_s32(q1s32, 14);
  q8s16 = vcombine_s16(d16s16, d17s16);
  q9s16 = vcombine_s16(d18s16, d19s16);

  q0s32 = vmull_s16(d20s16, d31s16);
  q1s32 = vmull_s16(d21s16, d31s16);
  q12s32 = vmull_s16(d20s16, d30s16);
  q13s32 = vmull_s16(d21s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
  q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
  q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);

  d22s16 = vqrshrn_n_s32(q0s32, 14);
  d23s16 = vqrshrn_n_s32(q1s32, 14);
  d20s16 = vqrshrn_n_s32(q12s32, 14);
  d21s16 = vqrshrn_n_s32(q13s32, 14);
  q10s16 = vcombine_s16(d20s16, d21s16);
  q11s16 = vcombine_s16(d22s16, d23s16);

  // stage 5: odd-half add/sub.
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q15s16 = vaddq_s16(q6s16, q7s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  // even-half add/sub, then (q14 +/- q13) * cospi_16.
  q0s16 = vaddq_s16(q8s16, q11s16);
  q1s16 = vaddq_s16(q9s16, q10s16);
  q2s16 = vsubq_s16(q9s16, q10s16);
  q3s16 = vsubq_s16(q8s16, q11s16);

  d16s16 = vdup_n_s16(cospi_16_64);

  q11s32 = vmull_s16(d26s16, d16s16);
  q12s32 = vmull_s16(d27s16, d16s16);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);

  q6s32 = vsubq_s32(q9s32, q11s32);
  q13s32 = vsubq_s32(q10s32, q12s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d11s16 = vqrshrn_n_s32(q13s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 6: final butterfly producing the eight output rows.
  q8s16 = vaddq_s16(q0s16, q15s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q15s16);

  d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
  d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

  // store the data
  output_stride >>= 1;  // output_stride / 2, out is int16_t
  vst1_u64((uint64_t *)out, d16u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d17u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d18u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d19u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d20u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d21u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d22u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d23u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d24u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d25u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d26u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d27u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d28u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d29u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d30u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d31u64);
  return;
}
// Second pass of the full (256-coefficient) 16x16 inverse DCT.
// Runs the odd half of the 1-D idct16 on eight de-interleaved rows from
// |src|, then combines with |pass1Output| (the even-half results).
//
//   src         : coefficient rows; vld2q_s16 keeps even-indexed
//                 elements, so the caller is expected to offset |src|
//                 so the odd coefficients are selected — confirm
//                 against the wrapper (not visible here).
//   out         : used only when skip_adding == 0; intermediate rows
//                 are stored interleaved into 16-wide output rows.
//   pass1Output : even-half results from pass 1, read 8 int16 at a time.
//   skip_adding : nonzero => round, add to |dest| pixels and store
//                 (final reconstruction); zero => store raw rows to out.
//   dest        : destination pixels (8 bytes per row touched).
//   dest_stride : byte stride of dest.
void vpx_idct16x16_256_add_neon_pass2(
    int16_t *src,
    int16_t *out,
    int16_t *pass1Output,
    int16_t skip_adding,
    uint8_t *dest,
    int dest_stride) {
  uint8_t *d;
  uint8x8_t d12u8, d13u8;
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  uint64x1_t d24u64, d25u64, d26u64, d27u64;
  int64x1_t d12s64, d13s64;
  uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
  uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32;
  int16x8x2_t q0x2s16;

  // Load eight rows; each vld2q_s16 consumes 16 elements, val[0] kept.
  q0x2s16 = vld2q_s16(src);
  q8s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q9s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q10s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q11s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q12s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q13s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q14s16 = q0x2s16.val[0];
  src += 16;
  q0x2s16 = vld2q_s16(src);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);
  d30s16 = vget_low_s16(q15s16);
  d31s16 = vget_high_s16(q15s16);

  // stage 2: odd-half butterflies; products are rounded back to 16 bits
  // with vqrshrn_n_s32(x, 14).
  d12s16 = vdup_n_s16(cospi_30_64);
  d13s16 = vdup_n_s16(cospi_2_64);

  q2s32 = vmull_s16(d16s16, d12s16);
  q3s32 = vmull_s16(d17s16, d12s16);
  q1s32 = vmull_s16(d16s16, d13s16);
  q4s32 = vmull_s16(d17s16, d13s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
  q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
  q4s32 = vmlal_s16(q4s32, d31s16, d12s16);

  d0s16 = vqrshrn_n_s32(q2s32, 14);
  d1s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q1s32, 14);
  d15s16 = vqrshrn_n_s32(q4s32, 14);
  q0s16 = vcombine_s16(d0s16, d1s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  d30s16 = vdup_n_s16(cospi_14_64);
  d31s16 = vdup_n_s16(cospi_18_64);

  q2s32 = vmull_s16(d24s16, d30s16);
  q3s32 = vmull_s16(d25s16, d30s16);
  q4s32 = vmull_s16(d24s16, d31s16);
  q5s32 = vmull_s16(d25s16, d31s16);

  q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d23s16, d30s16);

  d2s16 = vqrshrn_n_s32(q2s32, 14);
  d3s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q4s32, 14);
  d13s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  d30s16 = vdup_n_s16(cospi_22_64);
  d31s16 = vdup_n_s16(cospi_10_64);

  q11s32 = vmull_s16(d20s16, d30s16);
  q12s32 = vmull_s16(d21s16, d30s16);
  q4s32 = vmull_s16(d20s16, d31s16);
  q5s32 = vmull_s16(d21s16, d31s16);

  q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
  q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
  q5s32 = vmlal_s16(q5s32, d27s16, d30s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d11s16 = vqrshrn_n_s32(q5s32, 14);
  d10s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  d30s16 = vdup_n_s16(cospi_6_64);
  d31s16 = vdup_n_s16(cospi_26_64);

  q10s32 = vmull_s16(d28s16, d30s16);
  q11s32 = vmull_s16(d29s16, d30s16);
  q12s32 = vmull_s16(d28s16, d31s16);
  q13s32 = vmull_s16(d29s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
  q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
  q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
  q13s32 = vmlal_s16(q13s32, d19s16, d30s16);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q11s32, 14);
  d8s16 = vqrshrn_n_s32(q12s32, 14);
  d9s16 = vqrshrn_n_s32(q13s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // stage 3: odd-half add/sub.
  q9s16 = vsubq_s16(q0s16, q1s16);
  q0s16 = vaddq_s16(q0s16, q1s16);
  q10s16 = vsubq_s16(q3s16, q2s16);
  q11s16 = vaddq_s16(q2s16, q3s16);
  q12s16 = vaddq_s16(q4s16, q5s16);
  q13s16 = vsubq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q6s16, q7s16);

  // stage 4: rotate q9/q14 and q10/q13 by cospi_8/24.
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  d30s16 = vdup_n_s16(cospi_8_64);
  d31s16 = vdup_n_s16(cospi_24_64);

  q2s32 = vmull_s16(d18s16, d31s16);
  q3s32 = vmull_s16(d19s16, d31s16);
  q4s32 = vmull_s16(d28s16, d31s16);
  q5s32 = vmull_s16(d29s16, d31s16);

  q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
  q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
  q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);

  d12s16 = vqrshrn_n_s32(q2s32, 14);
  d13s16 = vqrshrn_n_s32(q3s32, 14);
  d2s16 = vqrshrn_n_s32(q4s32, 14);
  d3s16 = vqrshrn_n_s32(q5s32, 14);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // same rotation for q10/q13 but with -cospi_8.
  d30s16 = vdup_n_s16(-cospi_8_64);
  q11s32 = vmull_s16(d26s16, d30s16);
  q12s32 = vmull_s16(d27s16, d30s16);
  q8s32 = vmull_s16(d20s16, d30s16);
  q9s32 = vmull_s16(d21s16, d30s16);

  q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
  q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
  q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
  q9s32 = vmlal_s16(q9s32, d27s16, d31s16);

  d4s16 = vqrshrn_n_s32(q11s32, 14);
  d5s16 = vqrshrn_n_s32(q12s32, 14);
  d10s16 = vqrshrn_n_s32(q8s32, 14);
  d11s16 = vqrshrn_n_s32(q9s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  // stage 5: add/sub.
  q8s16 = vaddq_s16(q0s16, q3s16);
  q9s16 = vaddq_s16(q1s16, q2s16);
  q10s16 = vsubq_s16(q1s16, q2s16);
  q11s16 = vsubq_s16(q0s16, q3s16);
  q12s16 = vsubq_s16(q7s16, q4s16);
  q13s16 = vsubq_s16(q6s16, q5s16);
  q14s16 = vaddq_s16(q6s16, q5s16);
  q15s16 = vaddq_s16(q7s16, q4s16);

  // stage 6: (q13 +/- q10) and (q12 +/- q11) scaled by cospi_16.
  d20s16 = vget_low_s16(q10s16);
  d21s16 = vget_high_s16(q10s16);
  d22s16 = vget_low_s16(q11s16);
  d23s16 = vget_high_s16(q11s16);
  d24s16 = vget_low_s16(q12s16);
  d25s16 = vget_high_s16(q12s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);

  d14s16 = vdup_n_s16(cospi_16_64);

  q3s32 = vmull_s16(d26s16, d14s16);
  q4s32 = vmull_s16(d27s16, d14s16);
  q0s32 = vmull_s16(d20s16, d14s16);
  q1s32 = vmull_s16(d21s16, d14s16);

  q5s32 = vsubq_s32(q3s32, q0s32);
  q6s32 = vsubq_s32(q4s32, q1s32);
  q10s32 = vaddq_s32(q3s32, q0s32);
  q4s32 = vaddq_s32(q4s32, q1s32);

  d4s16 = vqrshrn_n_s32(q5s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  d10s16 = vqrshrn_n_s32(q10s32, 14);
  d11s16 = vqrshrn_n_s32(q4s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q0s32 = vmull_s16(d22s16, d14s16);
  q1s32 = vmull_s16(d23s16, d14s16);
  q13s32 = vmull_s16(d24s16, d14s16);
  q6s32 = vmull_s16(d25s16, d14s16);

  q10s32 = vsubq_s32(q13s32, q0s32);
  q4s32 = vsubq_s32(q6s32, q1s32);
  q13s32 = vaddq_s32(q13s32, q0s32);
  q6s32 = vaddq_s32(q6s32, q1s32);

  d6s16 = vqrshrn_n_s32(q10s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  d8s16 = vqrshrn_n_s32(q13s32, 14);
  d9s16 = vqrshrn_n_s32(q6s32, 14);
  q3s16 = vcombine_s16(d6s16, d7s16);
  q4s16 = vcombine_s16(d8s16, d9s16);

  // stage 7: combine with pass-1 output.
  if (skip_adding != 0) {
    d = dest;
    // load the data in pass1
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;

    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;

    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;

    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    d13s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;

    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    q12s16 = vrshrq_n_s16(q12s16, 6);
    q13s16 = vrshrq_n_s16(q13s16, 6);
    q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
                      vreinterpret_u8_s64(d12s64));
    q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
                      vreinterpret_u8_s64(d13s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
    d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
    d += dest_stride;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    // store the data out 8,9,10,11,12,13,14,15
    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q8s16 = vrshrq_n_s16(q8s16, 6);
    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q9s16 = vrshrq_n_s16(q9s16, 6);
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q2s16 = vrshrq_n_s16(q2s16, 6);
    q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q3s16 = vrshrq_n_s16(q3s16, 6);
    q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q4s16 = vrshrq_n_s16(q4s16, 6);
    q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q5s16 = vrshrq_n_s16(q5s16, 6);
    q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
                     vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    dest += dest_stride;
    q14s16 = vrshrq_n_s16(q14s16, 6);
    q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
                      vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
    d += dest_stride;

    d12s64 = vld1_s64((int64_t *)dest);
    q15s16 = vrshrq_n_s16(q15s16, 6);
    q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
                      vreinterpret_u8_s64(d12s64));
    d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
    vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
  } else {  // skip_adding_dest
    // store intermediate rows; each 16-wide output row gets two 4-lane
    // stores (low half then high half), hence the out += 4 / out += 12
    // stepping.
    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;

    q12s16 = vaddq_s16(q0s16, q15s16);
    q13s16 = vaddq_s16(q1s16, q14s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q14s16 = vsubq_s16(q1s16, q14s16);
    q15s16 = vsubq_s16(q0s16, q15s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);
    pass1Output += 8;

    q12s16 = vaddq_s16(q10s16, q5s16);
    q13s16 = vaddq_s16(q11s16, q4s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q4s16 = vsubq_s16(q11s16, q4s16);
    q5s16 = vsubq_s16(q10s16, q5s16);

    q0s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q1s16 = vld1q_s16(pass1Output);
    pass1Output += 8;

    q12s16 = vaddq_s16(q0s16, q3s16);
    q13s16 = vaddq_s16(q1s16, q2s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q2s16 = vsubq_s16(q1s16, q2s16);
    q3s16 = vsubq_s16(q0s16, q3s16);

    q10s16 = vld1q_s16(pass1Output);
    pass1Output += 8;
    q11s16 = vld1q_s16(pass1Output);

    q12s16 = vaddq_s16(q10s16, q9s16);
    q13s16 = vaddq_s16(q11s16, q8s16);
    d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
    d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
    d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
    d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
    vst1_u64((uint64_t *)out, d24u64);
    out += 4;
    vst1_u64((uint64_t *)out, d25u64);
    out += 12;
    vst1_u64((uint64_t *)out, d26u64);
    out += 4;
    vst1_u64((uint64_t *)out, d27u64);
    out += 12;
    q8s16 = vsubq_s16(q11s16, q8s16);
    q9s16 = vsubq_s16(q10s16, q9s16);

    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
    out += 12;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
    out += 4;
    vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
  }
  return;
}
// First pass of the 16x16 inverse DCT for the "10" case (only the
// top-left coefficients are nonzero), so most butterflies collapse to
// single vqrdmulhq_s16 multiplies by doubled cospi constants.
//
//   in            : source coefficients, consumed 16 int16 per row
//   out           : intermediate output, 8 int16 stored per half-row
//   output_stride : stride of |out| in BYTES (halved below for int16)
void vpx_idct16x16_10_add_neon_pass1(
    int16_t *in,
    int16_t *out,
    int output_stride) {
  int16x4_t d4s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
  uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
  int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4_t q6s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q15s32;
  int16x8x2_t q0x2s16;

  // Load eight rows; each vld2q_s16 consumes a full 16-element row and
  // keeps the even-indexed elements in val[0].
  q0x2s16 = vld2q_s16(in);
  q8s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q9s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q10s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q11s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q12s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q13s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q14s16 = q0x2s16.val[0];
  in += 16;
  q0x2s16 = vld2q_s16(in);
  q15s16 = q0x2s16.val[0];

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  // stage 3: only row 1 contributes; multiply by doubled constants via
  // the rounding-doubling high multiply.
  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
  q1s16 = vdupq_n_s16(cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);
  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  // stage 4: DC term scaled by cospi_16.
  q1s16 = vdupq_n_s16(cospi_16_64 * 2);
  d4s16 = vdup_n_s16(cospi_16_64);

  q8s16 = vqrdmulhq_s16(q8s16, q1s16);

  d8s16 = vget_low_s16(q4s16);
  d9s16 = vget_high_s16(q4s16);
  d14s16 = vget_low_s16(q7s16);
  d15s16 = vget_high_s16(q7s16);
  q9s32 = vmull_s16(d14s16, d4s16);
  q10s32 = vmull_s16(d15s16, d4s16);
  q12s32 = vmull_s16(d9s16, d4s16);
  q11s32 = vmull_s16(d8s16, d4s16);

  q15s32 = vsubq_s32(q10s32, q12s32);
  q6s32 = vsubq_s32(q9s32, q11s32);
  q9s32 = vaddq_s32(q9s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q12s32);

  d11s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q6s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q10s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 6: final add/sub around the DC term.
  q2s16 = vaddq_s16(q8s16, q7s16);
  q9s16 = vaddq_s16(q8s16, q6s16);
  q10s16 = vaddq_s16(q8s16, q5s16);
  q11s16 = vaddq_s16(q8s16, q4s16);
  q12s16 = vsubq_s16(q8s16, q4s16);
  q13s16 = vsubq_s16(q8s16, q5s16);
  q14s16 = vsubq_s16(q8s16, q6s16);
  q15s16 = vsubq_s16(q8s16, q7s16);

  d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
  d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
  d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
  d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
  d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
  d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
  d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
  d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
  d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
  d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
  d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
  d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
  d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
  d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
  d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
  d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));

  // store the data
  output_stride >>= 1;  // output_stride / 2, out is int16_t
  vst1_u64((uint64_t *)out, d4u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d5u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d18u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d19u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d20u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d21u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d22u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d23u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d24u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d25u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d26u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d27u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d28u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d29u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d30u64);
  out += output_stride;
  vst1_u64((uint64_t *)out, d31u64);
  return;
}
1020 void vpx_idct16x16_10_add_neon_pass2(
1023 int16_t *pass1Output,
1024 int16_t skip_adding,
1027 int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
1028 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
1029 int16x4_t d20s16, d21s16, d22s16, d23s16;
1030 int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
1031 uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
1032 uint64x1_t d16u64, d17u64, d18u64, d19u64;
1033 uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
1034 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
1035 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
1036 int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
1037 int32x4_t q10s32, q11s32, q12s32, q13s32;
1038 int16x8x2_t q0x2s16;
1043 q0x2s16 = vld2q_s16(src);
1044 q8s16 = q0x2s16.val[0];
1046 q0x2s16 = vld2q_s16(src);
1047 q9s16 = q0x2s16.val[0];
1049 q0x2s16 = vld2q_s16(src);
1050 q10s16 = q0x2s16.val[0];
1052 q0x2s16 = vld2q_s16(src);
1053 q11s16 = q0x2s16.val[0];
1055 q0x2s16 = vld2q_s16(src);
1056 q12s16 = q0x2s16.val[0];
1058 q0x2s16 = vld2q_s16(src);
1059 q13s16 = q0x2s16.val[0];
1061 q0x2s16 = vld2q_s16(src);
1062 q14s16 = q0x2s16.val[0];
1064 q0x2s16 = vld2q_s16(src);
1065 q15s16 = q0x2s16.val[0];
1067 TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
1068 &q12s16, &q13s16, &q14s16, &q15s16);
1071 q6s16 = vdupq_n_s16(cospi_30_64 * 2);
1072 q0s16 = vqrdmulhq_s16(q8s16, q6s16);
1073 q6s16 = vdupq_n_s16(cospi_2_64 * 2);
1074 q7s16 = vqrdmulhq_s16(q8s16, q6s16);
1076 q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
1077 q14s16 = vdupq_n_s16(cospi_6_64 * 2);
1078 q3s16 = vqrdmulhq_s16(q9s16, q15s16);
1079 q4s16 = vqrdmulhq_s16(q9s16, q14s16);
1082 d0s16 = vget_low_s16(q0s16);
1083 d1s16 = vget_high_s16(q0s16);
1084 d6s16 = vget_low_s16(q3s16);
1085 d7s16 = vget_high_s16(q3s16);
1086 d8s16 = vget_low_s16(q4s16);
1087 d9s16 = vget_high_s16(q4s16);
1088 d14s16 = vget_low_s16(q7s16);
1089 d15s16 = vget_high_s16(q7s16);
1091 d30s16 = vdup_n_s16(cospi_8_64);
1092 d31s16 = vdup_n_s16(cospi_24_64);
1094 q12s32 = vmull_s16(d14s16, d31s16);
1095 q5s32 = vmull_s16(d15s16, d31s16);
1096 q2s32 = vmull_s16(d0s16, d31s16);
1097 q11s32 = vmull_s16(d1s16, d31s16);
1099 q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
1100 q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
1101 q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
1102 q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
1104 d2s16 = vqrshrn_n_s32(q12s32, 14);
1105 d3s16 = vqrshrn_n_s32(q5s32, 14);
1106 d12s16 = vqrshrn_n_s32(q2s32, 14);
1107 d13s16 = vqrshrn_n_s32(q11s32, 14);
1108 q1s16 = vcombine_s16(d2s16, d3s16);
1109 q6s16 = vcombine_s16(d12s16, d13s16);
1111 d30s16 = vdup_n_s16(-cospi_8_64);
1112 q10s32 = vmull_s16(d8s16, d30s16);
1113 q13s32 = vmull_s16(d9s16, d30s16);
1114 q8s32 = vmull_s16(d6s16, d30s16);
1115 q9s32 = vmull_s16(d7s16, d30s16);
1117 q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
1118 q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
1119 q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
1120 q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
1122 d4s16 = vqrshrn_n_s32(q10s32, 14);
1123 d5s16 = vqrshrn_n_s32(q13s32, 14);
1124 d10s16 = vqrshrn_n_s32(q8s32, 14);
1125 d11s16 = vqrshrn_n_s32(q9s32, 14);
1126 q2s16 = vcombine_s16(d4s16, d5s16);
1127 q5s16 = vcombine_s16(d10s16, d11s16);
1130 q8s16 = vaddq_s16(q0s16, q3s16);
1131 q9s16 = vaddq_s16(q1s16, q2s16);
1132 q10s16 = vsubq_s16(q1s16, q2s16);
1133 q11s16 = vsubq_s16(q0s16, q3s16);
1134 q12s16 = vsubq_s16(q7s16, q4s16);
1135 q13s16 = vsubq_s16(q6s16, q5s16);
1136 q14s16 = vaddq_s16(q6s16, q5s16);
1137 q15s16 = vaddq_s16(q7s16, q4s16);
1140 d20s16 = vget_low_s16(q10s16);
1141 d21s16 = vget_high_s16(q10s16);
1142 d22s16 = vget_low_s16(q11s16);
1143 d23s16 = vget_high_s16(q11s16);
1144 d24s16 = vget_low_s16(q12s16);
1145 d25s16 = vget_high_s16(q12s16);
1146 d26s16 = vget_low_s16(q13s16);
1147 d27s16 = vget_high_s16(q13s16);
1149 d14s16 = vdup_n_s16(cospi_16_64);
1150 q3s32 = vmull_s16(d26s16, d14s16);
1151 q4s32 = vmull_s16(d27s16, d14s16);
1152 q0s32 = vmull_s16(d20s16, d14s16);
1153 q1s32 = vmull_s16(d21s16, d14s16);
1155 q5s32 = vsubq_s32(q3s32, q0s32);
1156 q6s32 = vsubq_s32(q4s32, q1s32);
1157 q0s32 = vaddq_s32(q3s32, q0s32);
1158 q4s32 = vaddq_s32(q4s32, q1s32);
1160 d4s16 = vqrshrn_n_s32(q5s32, 14);
1161 d5s16 = vqrshrn_n_s32(q6s32, 14);
1162 d10s16 = vqrshrn_n_s32(q0s32, 14);
1163 d11s16 = vqrshrn_n_s32(q4s32, 14);
1164 q2s16 = vcombine_s16(d4s16, d5s16);
1165 q5s16 = vcombine_s16(d10s16, d11s16);
1167 q0s32 = vmull_s16(d22s16, d14s16);
1168 q1s32 = vmull_s16(d23s16, d14s16);
1169 q13s32 = vmull_s16(d24s16, d14s16);
1170 q6s32 = vmull_s16(d25s16, d14s16);
1172 q10s32 = vsubq_s32(q13s32, q0s32);
1173 q4s32 = vsubq_s32(q6s32, q1s32);
1174 q13s32 = vaddq_s32(q13s32, q0s32);
1175 q6s32 = vaddq_s32(q6s32, q1s32);
1177 d6s16 = vqrshrn_n_s32(q10s32, 14);
1178 d7s16 = vqrshrn_n_s32(q4s32, 14);
1179 d8s16 = vqrshrn_n_s32(q13s32, 14);
1180 d9s16 = vqrshrn_n_s32(q6s32, 14);
1181 q3s16 = vcombine_s16(d6s16, d7s16);
1182 q4s16 = vcombine_s16(d8s16, d9s16);
1185 q0s16 = vld1q_s16(pass1Output);
1187 q1s16 = vld1q_s16(pass1Output);
1189 q12s16 = vaddq_s16(q0s16, q15s16);
1190 q13s16 = vaddq_s16(q1s16, q14s16);
1191 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1192 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1193 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1194 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1195 vst1_u64((uint64_t *)out, d24u64);
1197 vst1_u64((uint64_t *)out, d25u64);
1199 vst1_u64((uint64_t *)out, d26u64);
1201 vst1_u64((uint64_t *)out, d27u64);
1203 q14s16 = vsubq_s16(q1s16, q14s16);
1204 q15s16 = vsubq_s16(q0s16, q15s16);
1206 q10s16 = vld1q_s16(pass1Output);
1208 q11s16 = vld1q_s16(pass1Output);
1210 q12s16 = vaddq_s16(q10s16, q5s16);
1211 q13s16 = vaddq_s16(q11s16, q4s16);
1212 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1213 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1214 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1215 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1216 vst1_u64((uint64_t *)out, d24u64);
1218 vst1_u64((uint64_t *)out, d25u64);
1220 vst1_u64((uint64_t *)out, d26u64);
1222 vst1_u64((uint64_t *)out, d27u64);
1224 q4s16 = vsubq_s16(q11s16, q4s16);
1225 q5s16 = vsubq_s16(q10s16, q5s16);
1227 q0s16 = vld1q_s16(pass1Output);
1229 q1s16 = vld1q_s16(pass1Output);
1231 q12s16 = vaddq_s16(q0s16, q3s16);
1232 q13s16 = vaddq_s16(q1s16, q2s16);
1233 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1234 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1235 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1236 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1237 vst1_u64((uint64_t *)out, d24u64);
1239 vst1_u64((uint64_t *)out, d25u64);
1241 vst1_u64((uint64_t *)out, d26u64);
1243 vst1_u64((uint64_t *)out, d27u64);
1245 q2s16 = vsubq_s16(q1s16, q2s16);
1246 q3s16 = vsubq_s16(q0s16, q3s16);
1248 q10s16 = vld1q_s16(pass1Output);
1250 q11s16 = vld1q_s16(pass1Output);
1251 q12s16 = vaddq_s16(q10s16, q9s16);
1252 q13s16 = vaddq_s16(q11s16, q8s16);
1253 d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
1254 d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
1255 d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
1256 d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
1257 vst1_u64((uint64_t *)out, d24u64);
1259 vst1_u64((uint64_t *)out, d25u64);
1261 vst1_u64((uint64_t *)out, d26u64);
1263 vst1_u64((uint64_t *)out, d27u64);
1265 q8s16 = vsubq_s16(q11s16, q8s16);
1266 q9s16 = vsubq_s16(q10s16, q9s16);
1268 d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
1269 d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
1270 d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
1271 d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
1272 d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
1273 d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
1274 d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
1275 d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
1276 d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
1277 d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
1278 d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
1279 d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
1280 d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
1281 d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
1282 d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
1283 d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
1285 vst1_u64((uint64_t *)out, d16u64);
1287 vst1_u64((uint64_t *)out, d17u64);
1289 vst1_u64((uint64_t *)out, d18u64);
1291 vst1_u64((uint64_t *)out, d19u64);
1293 vst1_u64((uint64_t *)out, d4u64);
1295 vst1_u64((uint64_t *)out, d5u64);
1297 vst1_u64((uint64_t *)out, d6u64);
1299 vst1_u64((uint64_t *)out, d7u64);
1301 vst1_u64((uint64_t *)out, d8u64);
1303 vst1_u64((uint64_t *)out, d9u64);
1305 vst1_u64((uint64_t *)out, d10u64);
1307 vst1_u64((uint64_t *)out, d11u64);
1309 vst1_u64((uint64_t *)out, d28u64);
1311 vst1_u64((uint64_t *)out, d29u64);
1313 vst1_u64((uint64_t *)out, d30u64);
1315 vst1_u64((uint64_t *)out, d31u64);