2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/arm/transpose_neon.h"
16 #include "vpx_dsp/txfm_common.h"
// Load rows 'first' and 'second' (8 int16 coefficients each) of the
// transposed scratch buffer into q14s16/q13s16 for a following butterfly.
// 'prev' is unused; it records the previously accessed row (bookkeeping
// carried over from the original assembly) and aids review of access order.
18 #define LOAD_FROM_TRANSPOSED(prev, first, second) \
19 q14s16 = vld1q_s16(trans_buf + first * 8); \
20 q13s16 = vld1q_s16(trans_buf + second * 8);
// Load output rows 'first' and 'second' from 'out' (row stride of 32
// int16s) into qA/qB. 'prev' is unused bookkeeping from the assembly
// version documenting the last row touched.
22 #define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
23 qA = vld1q_s16(out + first * 32); \
24 qB = vld1q_s16(out + second * 32);
// Store qA/qB to output rows 'first' and 'second' of 'out' (row stride of
// 32 int16s). 'prev' is unused bookkeeping from the assembly version.
26 #define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
27 vst1q_s16(out + first * 32, qA); \
28 vst1q_s16(out + second * 32, qB);
// Round four IDCT result rows (q6..q9), add them to the pixels at the two
// center destination pointers p1/p2, saturate to uint8 and store back.
// The wrapper macro forwards the caller's fixed q-register set.
// NOTE(review): 8 destination bytes are moved through int16x4_t purely as
// an 8-byte register container; the (int16_t *) casts over uint8_t rows are
// a reinterpretation idiom inherited from the assembly original.
30 #define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
31 __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
32 static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
33 int stride, int16x8_t q6s16,
37 int16x4_t d8s16, d9s16, d10s16, d11s16;
// Load the existing destination pixels (8 bytes per row).
// NOTE(review): pointer advances between these loads are not visible in
// this excerpt -- presumably p1/p2 step by 'stride' between the two loads;
// confirm against the full source.
39 d8s16 = vld1_s16((int16_t *)p1);
41 d11s16 = vld1_s16((int16_t *)p2);
43 d9s16 = vld1_s16((int16_t *)p1);
44 d10s16 = vld1_s16((int16_t *)p2);
// Rounding shift right by 6: the final 1/64 scaling of the 32x32 IDCT.
46 q7s16 = vrshrq_n_s16(q7s16, 6);
47 q8s16 = vrshrq_n_s16(q8s16, 6);
48 q9s16 = vrshrq_n_s16(q9s16, 6);
49 q6s16 = vrshrq_n_s16(q6s16, 6);
// Widen-add the destination bytes onto the 16-bit residuals.
51 q7s16 = vreinterpretq_s16_u16(
52 vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
53 q8s16 = vreinterpretq_s16_u16(
54 vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
55 q9s16 = vreinterpretq_s16_u16(
56 vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
57 q6s16 = vreinterpretq_s16_u16(
58 vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));
// Saturating narrow (vqmovun) clamps each sum to the uint8 pixel range.
60 d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
61 d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
62 d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
63 d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
// Store the combined pixels back to the destination rows.
65 vst1_s16((int16_t *)p1, d9s16);
67 vst1_s16((int16_t *)p2, d10s16);
69 vst1_s16((int16_t *)p1, d8s16);
70 vst1_s16((int16_t *)p2, d11s16);
// Round four IDCT result rows (q4..q7), add them to the pixels at the two
// extreme destination pointers p1/p2 (top/bottom of the block), saturate to
// uint8 and store back. Mirrors __STORE_COMBINE_CENTER_RESULTS but for the
// outer row pair; the wrapper macro forwards the caller's register set.
73 #define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
74 __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
75 static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
76 int stride, int16x8_t q4s16,
80 int16x4_t d4s16, d5s16, d6s16, d7s16;
// Load the existing destination pixels (8 bytes per row).
// NOTE(review): the p1/p2 stride advances between loads are not visible in
// this excerpt; confirm against the full source.
82 d4s16 = vld1_s16((int16_t *)p1);
84 d7s16 = vld1_s16((int16_t *)p2);
86 d5s16 = vld1_s16((int16_t *)p1);
87 d6s16 = vld1_s16((int16_t *)p2);
// Rounding shift right by 6: the final 1/64 scaling of the 32x32 IDCT.
89 q5s16 = vrshrq_n_s16(q5s16, 6);
90 q6s16 = vrshrq_n_s16(q6s16, 6);
91 q7s16 = vrshrq_n_s16(q7s16, 6);
92 q4s16 = vrshrq_n_s16(q4s16, 6);
// Widen-add the destination bytes onto the 16-bit residuals.
94 q5s16 = vreinterpretq_s16_u16(
95 vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
96 q6s16 = vreinterpretq_s16_u16(
97 vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
98 q7s16 = vreinterpretq_s16_u16(
99 vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
100 q4s16 = vreinterpretq_s16_u16(
101 vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));
// Saturating narrow clamps each sum to the uint8 pixel range.
103 d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
104 d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
105 d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
106 d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
// Store the combined pixels back to the destination rows.
108 vst1_s16((int16_t *)p1, d5s16);
110 vst1_s16((int16_t *)p2, d6s16);
112 vst1_s16((int16_t *)p2, d7s16);
113 vst1_s16((int16_t *)p1, d4s16);
// DO_BUTTERFLY_STD: butterfly on the register pair loaded by
// LOAD_FROM_TRANSPOSED (q14s16/q13s16) with the two cosine constants.
116 #define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
117 DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
// Fixed-point IDCT butterfly. With Q14 constants (cospi_*_64):
//   *qAs16 = round((q14s16 * first_const - q13s16 * second_const) >> 14)
//   *qBs16 = round((q14s16 * second_const + q13s16 * first_const) >> 14)
// All products are formed in 32 bits (vmull_s16) on the low and high
// halves, then narrowed with rounding and saturation (vqrshrn_n_s32, 14).
118 static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
119 int16_t first_const, int16_t second_const,
120 int16x8_t *qAs16, int16x8_t *qBs16) {
121 int16x4_t d30s16, d31s16;
122 int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
123 int16x4_t dCs16, dDs16, dAs16, dBs16;
// Split both inputs into 4-lane halves for 16x16 -> 32-bit multiplies.
125 dCs16 = vget_low_s16(q14s16);
126 dDs16 = vget_high_s16(q14s16);
127 dAs16 = vget_low_s16(q13s16);
128 dBs16 = vget_high_s16(q13s16);
// Broadcast the two constants across all lanes.
130 d30s16 = vdup_n_s16(first_const);
131 d31s16 = vdup_n_s16(second_const);
// qA partials: q8 = low(q14)*c1, q9 = high(q14)*c1, minus q13*c2 below.
133 q8s32 = vmull_s16(dCs16, d30s16);
134 q10s32 = vmull_s16(dAs16, d31s16);
135 q9s32 = vmull_s16(dDs16, d30s16);
136 q11s32 = vmull_s16(dBs16, d31s16);
137 q12s32 = vmull_s16(dCs16, d31s16);
// qA = q14*c1 - q13*c2 (low half in q8, high half in q9).
139 q8s32 = vsubq_s32(q8s32, q10s32);
140 q9s32 = vsubq_s32(q9s32, q11s32);
// qB partials: q14*c2 and q13*c1.
142 q10s32 = vmull_s16(dDs16, d31s16);
143 q11s32 = vmull_s16(dAs16, d30s16);
144 q15s32 = vmull_s16(dBs16, d30s16);
// qB = q14*c2 + q13*c1 (low half in q11, high half in q10).
146 q11s32 = vaddq_s32(q12s32, q11s32);
147 q10s32 = vaddq_s32(q10s32, q15s32);
// Narrow both results back to Q0 int16 with rounding at bit 14.
149 *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
150 *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
// Transpose a 32-row x 8-column strip of the 32x32 input into t_buf as four
// 8x8 transposed tiles, so subsequent passes can read coefficients in the
// transposed order. 'stride' is the input row length (32 int16s).
// NOTE(review): the setup of the 'in' pointer and the pointer increments
// between the loads/stores are not visible in this excerpt -- presumably
// 'in' advances by 'stride' per load and t_buf by 8 per store; confirm
// against the full source.
153 static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
156 const int stride = 32;
157 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
// Four iterations cover 4 * 8 = 32 input columns, stepping 8 per pass.
159 for (i = 0; i < 4; i++, input += 8) {
// Load an 8x8 tile, one row per register.
161 q8s16 = vld1q_s16(in);
163 q9s16 = vld1q_s16(in);
165 q10s16 = vld1q_s16(in);
167 q11s16 = vld1q_s16(in);
169 q12s16 = vld1q_s16(in);
171 q13s16 = vld1q_s16(in);
173 q14s16 = vld1q_s16(in);
175 q15s16 = vld1q_s16(in);
// In-register 8x8 transpose, then write the tile out contiguously.
177 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
180 vst1q_s16(t_buf, q8s16);
182 vst1q_s16(t_buf, q9s16);
184 vst1q_s16(t_buf, q10s16);
186 vst1q_s16(t_buf, q11s16);
188 vst1q_s16(t_buf, q12s16);
190 vst1q_s16(t_buf, q13s16);
192 vst1q_s16(t_buf, q14s16);
194 vst1q_s16(t_buf, q15s16);
// Final combination stage of pass 1: merge the in-register band results
// (q2,q3,q6..q14 and -- per the call pattern -- q15 on a parameter line not
// visible in this excerpt) with rows already staged in 'out', writing all
// 32 rows of the intermediate buffer. Row indices in the LOAD/STORE macros
// address 'out' with a stride of 32 int16s per row; the first macro
// argument is unused bookkeeping of the previously touched row.
199 static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
200 int16x8_t q3s16, int16x8_t q6s16,
201 int16x8_t q7s16, int16x8_t q8s16,
202 int16x8_t q9s16, int16x8_t q10s16,
203 int16x8_t q11s16, int16x8_t q12s16,
204 int16x8_t q13s16, int16x8_t q14s16,
206 int16x8_t q0s16, q1s16, q4s16, q5s16;
// Rows 14-17 come straight from registers.
208 STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
209 STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
// Rows 0,1 = q2/q3 + rows 31/30; rows 30,31 = the differences.
211 LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
212 q4s16 = vaddq_s16(q2s16, q1s16);
213 q5s16 = vaddq_s16(q3s16, q0s16);
214 q6s16 = vsubq_s16(q3s16, q0s16);
215 q7s16 = vsubq_s16(q2s16, q1s16);
216 STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
217 STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
// Combine q10/q11 with staged rows 12,13 ...
219 LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
220 q2s16 = vaddq_s16(q10s16, q1s16);
221 q3s16 = vaddq_s16(q11s16, q0s16);
222 q4s16 = vsubq_s16(q11s16, q0s16);
223 q5s16 = vsubq_s16(q10s16, q1s16);
// ... then fold in rows 18,19 to finish rows 12,13,18,19.
225 LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
226 q8s16 = vaddq_s16(q4s16, q1s16);
227 q9s16 = vaddq_s16(q5s16, q0s16);
228 q6s16 = vsubq_s16(q5s16, q0s16);
229 q7s16 = vsubq_s16(q4s16, q1s16);
230 STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
231 STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
// Rows 2,3 = sums with rows 29/28; rows 28,29 = differences.
233 LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
234 q4s16 = vaddq_s16(q2s16, q1s16);
235 q5s16 = vaddq_s16(q3s16, q0s16);
236 q6s16 = vsubq_s16(q3s16, q0s16);
237 q7s16 = vsubq_s16(q2s16, q1s16);
238 STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
239 STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
// Combine q12/q13 with staged rows 10,11 ...
241 LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
242 q2s16 = vaddq_s16(q12s16, q1s16);
243 q3s16 = vaddq_s16(q13s16, q0s16);
244 q4s16 = vsubq_s16(q13s16, q0s16);
245 q5s16 = vsubq_s16(q12s16, q1s16);
// ... then fold in rows 20,21 to finish rows 10,11,20,21.
247 LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
248 q8s16 = vaddq_s16(q4s16, q1s16);
249 q9s16 = vaddq_s16(q5s16, q0s16);
250 q6s16 = vsubq_s16(q5s16, q0s16);
251 q7s16 = vsubq_s16(q4s16, q1s16);
252 STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
253 STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
// Rows 4,5 = sums with rows 27/26; rows 26,27 = differences.
255 LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
256 q4s16 = vaddq_s16(q2s16, q1s16);
257 q5s16 = vaddq_s16(q3s16, q0s16);
258 q6s16 = vsubq_s16(q3s16, q0s16);
259 q7s16 = vsubq_s16(q2s16, q1s16);
260 STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
261 STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
// Combine q14/q15 with staged rows 8,9 ...
263 LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
264 q2s16 = vaddq_s16(q14s16, q1s16);
265 q3s16 = vaddq_s16(q15s16, q0s16);
266 q4s16 = vsubq_s16(q15s16, q0s16);
267 q5s16 = vsubq_s16(q14s16, q1s16);
// ... then fold in rows 22,23 to finish rows 8,9,22,23.
269 LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
270 q8s16 = vaddq_s16(q4s16, q1s16);
271 q9s16 = vaddq_s16(q5s16, q0s16);
272 q6s16 = vsubq_s16(q5s16, q0s16);
273 q7s16 = vsubq_s16(q4s16, q1s16);
274 STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
275 STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
// Rows 6,7 = sums with rows 25/24; rows 24,25 = differences.
277 LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
278 q4s16 = vaddq_s16(q2s16, q1s16);
279 q5s16 = vaddq_s16(q3s16, q0s16);
280 q6s16 = vsubq_s16(q3s16, q0s16);
281 q7s16 = vsubq_s16(q2s16, q1s16);
282 STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
283 STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
// Final combination stage of pass 2: same band arithmetic as
// idct32_bands_end_1st_pass, but instead of writing rows back to 'out' the
// results are rounded and added into the destination frame via the
// STORE_COMBINE_* macros. r7/r6 walk the extreme rows (top-down from row 0
// and bottom-up from row 31); r10/r9 walk the center rows (outward from
// rows 16 and 15).
// NOTE(review): the pointer advances (by 'str2' = 2*stride) between the
// successive STORE_COMBINE_* calls are not visible in this excerpt; confirm
// against the full source.
286 static INLINE void idct32_bands_end_2nd_pass(
287 int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
288 int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
289 int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
290 int16x8_t q14s16, int16x8_t q15s16) {
291 uint8_t *r6 = dest + 31 * stride;
292 uint8_t *r7 = dest /* + 0 * stride*/;
293 uint8_t *r9 = dest + 15 * stride;
294 uint8_t *r10 = dest + 16 * stride;
295 int str2 = stride << 1;
296 int16x8_t q0s16, q1s16, q4s16, q5s16;
// Rows 15/16 (center) come straight from registers q6..q9.
298 STORE_COMBINE_CENTER_RESULTS(r10, r9);
// Rows 0,1 and 30,31: q2/q3 +/- staged rows 31/30.
302 LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
303 q4s16 = vaddq_s16(q2s16, q1s16);
304 q5s16 = vaddq_s16(q3s16, q0s16);
305 q6s16 = vsubq_s16(q3s16, q0s16);
306 q7s16 = vsubq_s16(q2s16, q1s16);
307 STORE_COMBINE_EXTREME_RESULTS(r7, r6);
// Combine q10/q11 with staged rows 12,13 ...
311 LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
312 q2s16 = vaddq_s16(q10s16, q1s16);
313 q3s16 = vaddq_s16(q11s16, q0s16);
314 q4s16 = vsubq_s16(q11s16, q0s16);
315 q5s16 = vsubq_s16(q10s16, q1s16);
// ... then fold in rows 18,19 and emit the next center row pair.
317 LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
318 q8s16 = vaddq_s16(q4s16, q1s16);
319 q9s16 = vaddq_s16(q5s16, q0s16);
320 q6s16 = vsubq_s16(q5s16, q0s16);
321 q7s16 = vsubq_s16(q4s16, q1s16);
322 STORE_COMBINE_CENTER_RESULTS(r10, r9);
// Rows 2,3 and 28,29: sums/differences with staged rows 29/28.
326 LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
327 q4s16 = vaddq_s16(q2s16, q1s16);
328 q5s16 = vaddq_s16(q3s16, q0s16);
329 q6s16 = vsubq_s16(q3s16, q0s16);
330 q7s16 = vsubq_s16(q2s16, q1s16);
331 STORE_COMBINE_EXTREME_RESULTS(r7, r6);
// Combine q12/q13 with staged rows 10,11 ...
335 LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
336 q2s16 = vaddq_s16(q12s16, q1s16);
337 q3s16 = vaddq_s16(q13s16, q0s16);
338 q4s16 = vsubq_s16(q13s16, q0s16);
339 q5s16 = vsubq_s16(q12s16, q1s16);
// ... then fold in rows 20,21 and emit the next center row pair.
341 LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
342 q8s16 = vaddq_s16(q4s16, q1s16);
343 q9s16 = vaddq_s16(q5s16, q0s16);
344 q6s16 = vsubq_s16(q5s16, q0s16);
345 q7s16 = vsubq_s16(q4s16, q1s16);
346 STORE_COMBINE_CENTER_RESULTS(r10, r9);
// Rows 4,5 and 26,27: sums/differences with staged rows 27/26.
350 LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
351 q4s16 = vaddq_s16(q2s16, q1s16);
352 q5s16 = vaddq_s16(q3s16, q0s16);
353 q6s16 = vsubq_s16(q3s16, q0s16);
354 q7s16 = vsubq_s16(q2s16, q1s16);
355 STORE_COMBINE_EXTREME_RESULTS(r7, r6);
// Combine q14/q15 with staged rows 8,9 ...
359 LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
360 q2s16 = vaddq_s16(q14s16, q1s16);
361 q3s16 = vaddq_s16(q15s16, q0s16);
362 q4s16 = vsubq_s16(q15s16, q0s16);
363 q5s16 = vsubq_s16(q14s16, q1s16);
// ... then fold in rows 22,23 and emit the innermost center rows.
365 LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
366 q8s16 = vaddq_s16(q4s16, q1s16);
367 q9s16 = vaddq_s16(q5s16, q0s16);
368 q6s16 = vsubq_s16(q5s16, q0s16);
369 q7s16 = vsubq_s16(q4s16, q1s16);
370 STORE_COMBINE_CENTER_RESULTS(r10, r9);
// Rows 6,7 and 24,25: sums/differences with staged rows 25/24.
372 LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
373 q4s16 = vaddq_s16(q2s16, q1s16);
374 q5s16 = vaddq_s16(q3s16, q0s16);
375 q6s16 = vsubq_s16(q3s16, q0s16);
376 q7s16 = vsubq_s16(q2s16, q1s16);
377 STORE_COMBINE_EXTREME_RESULTS(r7, r6);
380 void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
382 int i, idct32_pass_loop;
383 int16_t trans_buf[32 * 8];
384 int16_t pass1[32 * 32];
385 int16_t pass2[32 * 32];
387 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
388 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
390 for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
392 input = pass1, // the input of pass2 is the result of pass1
394 for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
395 idct32_transpose_pair(input, trans_buf);
397 // -----------------------------------------
398 // BLOCK A: 16-19,28-31
399 // -----------------------------------------
400 // generate 16,17,30,31
402 LOAD_FROM_TRANSPOSED(0, 1, 31)
403 DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
404 LOAD_FROM_TRANSPOSED(31, 17, 15)
405 DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
407 q4s16 = vaddq_s16(q0s16, q1s16);
408 q13s16 = vsubq_s16(q0s16, q1s16);
409 q6s16 = vaddq_s16(q2s16, q3s16);
410 q14s16 = vsubq_s16(q2s16, q3s16);
412 DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
414 // generate 18,19,28,29
416 LOAD_FROM_TRANSPOSED(15, 9, 23)
417 DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
418 LOAD_FROM_TRANSPOSED(23, 25, 7)
419 DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
421 q13s16 = vsubq_s16(q3s16, q2s16);
422 q3s16 = vaddq_s16(q3s16, q2s16);
423 q14s16 = vsubq_s16(q1s16, q0s16);
424 q2s16 = vaddq_s16(q1s16, q0s16);
426 DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
428 q8s16 = vaddq_s16(q4s16, q2s16);
429 q9s16 = vaddq_s16(q5s16, q0s16);
430 q10s16 = vaddq_s16(q7s16, q1s16);
431 q15s16 = vaddq_s16(q6s16, q3s16);
432 q13s16 = vsubq_s16(q5s16, q0s16);
433 q14s16 = vsubq_s16(q7s16, q1s16);
434 STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
435 STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
437 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
438 STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
440 q13s16 = vsubq_s16(q4s16, q2s16);
441 q14s16 = vsubq_s16(q6s16, q3s16);
443 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
444 STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
446 // -----------------------------------------
447 // BLOCK B: 20-23,24-27
448 // -----------------------------------------
449 // generate 20,21,26,27
451 LOAD_FROM_TRANSPOSED(7, 5, 27)
452 DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
453 LOAD_FROM_TRANSPOSED(27, 21, 11)
454 DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
456 q13s16 = vsubq_s16(q0s16, q1s16);
457 q0s16 = vaddq_s16(q0s16, q1s16);
458 q14s16 = vsubq_s16(q2s16, q3s16);
459 q2s16 = vaddq_s16(q2s16, q3s16);
461 DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
463 // generate 22,23,24,25
465 LOAD_FROM_TRANSPOSED(11, 13, 19)
466 DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
467 LOAD_FROM_TRANSPOSED(19, 29, 3)
468 DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
470 q14s16 = vsubq_s16(q4s16, q5s16);
471 q5s16 = vaddq_s16(q4s16, q5s16);
472 q13s16 = vsubq_s16(q6s16, q7s16);
473 q6s16 = vaddq_s16(q6s16, q7s16);
475 DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
477 q10s16 = vaddq_s16(q7s16, q1s16);
478 q11s16 = vaddq_s16(q5s16, q0s16);
479 q12s16 = vaddq_s16(q6s16, q2s16);
480 q15s16 = vaddq_s16(q4s16, q3s16);
482 LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
483 q8s16 = vaddq_s16(q14s16, q11s16);
484 q9s16 = vaddq_s16(q13s16, q10s16);
485 q13s16 = vsubq_s16(q13s16, q10s16);
486 q11s16 = vsubq_s16(q14s16, q11s16);
487 STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
488 LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
489 q8s16 = vsubq_s16(q9s16, q12s16);
490 q10s16 = vaddq_s16(q14s16, q15s16);
491 q14s16 = vsubq_s16(q14s16, q15s16);
492 q12s16 = vaddq_s16(q9s16, q12s16);
493 STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
495 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
496 STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
499 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
500 STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
502 q14s16 = vsubq_s16(q5s16, q0s16);
503 q13s16 = vsubq_s16(q6s16, q2s16);
504 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
505 q14s16 = vsubq_s16(q7s16, q1s16);
506 q13s16 = vsubq_s16(q4s16, q3s16);
507 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
509 LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
510 q8s16 = vaddq_s16(q14s16, q1s16);
511 q9s16 = vaddq_s16(q13s16, q6s16);
512 q13s16 = vsubq_s16(q13s16, q6s16);
513 q1s16 = vsubq_s16(q14s16, q1s16);
514 STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
515 LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
516 q14s16 = vsubq_s16(q8s16, q5s16);
517 q10s16 = vaddq_s16(q8s16, q5s16);
518 q11s16 = vaddq_s16(q9s16, q0s16);
519 q0s16 = vsubq_s16(q9s16, q0s16);
520 STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
522 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
523 STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
524 DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
525 STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
527 // -----------------------------------------
528 // BLOCK C: 8-10,11-15
529 // -----------------------------------------
530 // generate 8,9,14,15
532 LOAD_FROM_TRANSPOSED(3, 2, 30)
533 DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
534 LOAD_FROM_TRANSPOSED(30, 18, 14)
535 DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
537 q13s16 = vsubq_s16(q0s16, q1s16);
538 q0s16 = vaddq_s16(q0s16, q1s16);
539 q14s16 = vsubq_s16(q2s16, q3s16);
540 q2s16 = vaddq_s16(q2s16, q3s16);
542 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
544 // generate 10,11,12,13
546 LOAD_FROM_TRANSPOSED(14, 10, 22)
547 DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
548 LOAD_FROM_TRANSPOSED(22, 26, 6)
549 DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
551 q14s16 = vsubq_s16(q4s16, q5s16);
552 q5s16 = vaddq_s16(q4s16, q5s16);
553 q13s16 = vsubq_s16(q6s16, q7s16);
554 q6s16 = vaddq_s16(q6s16, q7s16);
556 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
558 q8s16 = vaddq_s16(q0s16, q5s16);
559 q9s16 = vaddq_s16(q1s16, q7s16);
560 q13s16 = vsubq_s16(q1s16, q7s16);
561 q14s16 = vsubq_s16(q3s16, q4s16);
562 q10s16 = vaddq_s16(q3s16, q4s16);
563 q15s16 = vaddq_s16(q2s16, q6s16);
564 STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
565 STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
567 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
568 STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
569 q13s16 = vsubq_s16(q0s16, q5s16);
570 q14s16 = vsubq_s16(q2s16, q6s16);
571 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
572 STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
574 // -----------------------------------------
576 // -----------------------------------------
579 LOAD_FROM_TRANSPOSED(6, 4, 28)
580 DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
581 LOAD_FROM_TRANSPOSED(28, 20, 12)
582 DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
584 q13s16 = vsubq_s16(q0s16, q1s16);
585 q0s16 = vaddq_s16(q0s16, q1s16);
586 q14s16 = vsubq_s16(q2s16, q3s16);
587 q2s16 = vaddq_s16(q2s16, q3s16);
589 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
593 LOAD_FROM_TRANSPOSED(12, 0, 16)
594 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
595 LOAD_FROM_TRANSPOSED(16, 8, 24)
596 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
598 q4s16 = vaddq_s16(q7s16, q6s16);
599 q7s16 = vsubq_s16(q7s16, q6s16);
600 q6s16 = vsubq_s16(q5s16, q14s16);
601 q5s16 = vaddq_s16(q5s16, q14s16);
603 q8s16 = vaddq_s16(q4s16, q2s16);
604 q9s16 = vaddq_s16(q5s16, q3s16);
605 q10s16 = vaddq_s16(q6s16, q1s16);
606 q11s16 = vaddq_s16(q7s16, q0s16);
607 q12s16 = vsubq_s16(q7s16, q0s16);
608 q13s16 = vsubq_s16(q6s16, q1s16);
609 q14s16 = vsubq_s16(q5s16, q3s16);
610 q15s16 = vsubq_s16(q4s16, q2s16);
612 LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
613 q2s16 = vaddq_s16(q8s16, q1s16);
614 q3s16 = vaddq_s16(q9s16, q0s16);
615 q4s16 = vsubq_s16(q9s16, q0s16);
616 q5s16 = vsubq_s16(q8s16, q1s16);
617 LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
618 q8s16 = vaddq_s16(q4s16, q1s16);
619 q9s16 = vaddq_s16(q5s16, q0s16);
620 q6s16 = vsubq_s16(q5s16, q0s16);
621 q7s16 = vsubq_s16(q4s16, q1s16);
623 if (idct32_pass_loop == 0) {
624 idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
625 q10s16, q11s16, q12s16, q13s16, q14s16,
628 idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
629 q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,