/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/arm/transpose_neon.h"
16 #include "vpx_dsp/txfm_common.h"
/* Load two 8-lane coefficient rows from trans_buf (the caller's 32x8
 * transposed scratch buffer) into the fixed registers q14s16/q13s16 that
 * feed DO_BUTTERFLY_STD.  'prev' is unused register-tracking bookkeeping
 * carried over from the assembly this code mirrors.
 * NOTE(review): multi-statement macro, deliberately NOT wrapped in
 * do { } while (0) -- call sites in this file invoke it without a trailing
 * semicolon and rely on the bare expansion. */
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
  q14s16 = vld1q_s16(trans_buf + first * 8);      \
  q13s16 = vld1q_s16(trans_buf + second * 8);
/* Load output rows 'first' and 'second' (row pitch 32 int16) from the
 * in-scope 'out' buffer into qA/qB.  'prev' is unused bookkeeping from
 * the assembly original.  Not do-while wrapped by design (see call sites
 * that omit the trailing semicolon). */
#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  qA = vld1q_s16(out + first * 32);                   \
  qB = vld1q_s16(out + second * 32);
/* Store qA/qB to output rows 'first' and 'second' (row pitch 32 int16) of
 * the in-scope 'out' buffer.  'prev' is unused bookkeeping from the
 * assembly original.  Not do-while wrapped by design. */
#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  vst1q_s16(out + first * 32, qA);                   \
  vst1q_s16(out + second * 32, qB);
/* Combine four IDCT residual rows (q6..q9) with destination pixels reached
 * through the two center-row cursors r10/r9: round by (x + 32) >> 6, add
 * the pixels, saturate to unsigned 8-bit and store back. */
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
/* NOTE(review): identifiers beginning with two underscores are reserved for
 * the implementation in C; consider renaming this helper upstream-style. */
static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2,
                                                  int stride, int16x8_t q6s16,
  /* NOTE(review): this extraction dropped lines here -- the remaining
   * q7s16/q8s16/q9s16 parameters, the opening brace, and the p1/p2
   * stride advances between the loads/stores below.  Confirm against
   * upstream before editing. */
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  /* Load two rows of 8 destination pixels through each pointer.  The
   * (int16_t *) casts merely reinterpret 8 bytes as one 64-bit lane
   * group for vld1_s16; no per-lane conversion is implied. */
  d8s16 = vld1_s16((int16_t *)p1);
  d11s16 = vld1_s16((int16_t *)p2);
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  /* Round the 16-bit residual: vrshr by 6 computes (x + 32) >> 6. */
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  /* Widen the u8 pixels to u16 and add them to the rounded residual. */
  q7s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16)));

  /* Saturate the sums back to unsigned 8-bit pixel values. */
  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  /* Store the combined rows (p1/p2 presumably advanced by stride between
   * the loads above and again here -- lines missing from this view). */
  vst1_s16((int16_t *)p1, d9s16);
  vst1_s16((int16_t *)p2, d10s16);
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
/* Combine four IDCT residual rows (q4..q7) with destination pixels reached
 * through the outer-row cursors r7/r6: round by (x + 32) >> 6, add the
 * pixels, saturate to unsigned 8-bit and store back. */
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
/* NOTE(review): double-underscore-prefixed identifiers are reserved in C. */
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2,
                                                   int stride, int16x8_t q4s16,
  /* NOTE(review): this extraction dropped lines here -- the remaining
   * q5s16/q6s16/q7s16 parameters, the opening brace, and the p1/p2
   * stride advances between the loads/stores below.  Confirm against
   * upstream before editing. */
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  /* Load two rows of 8 destination pixels through each pointer; the
   * (int16_t *) casts only reinterpret 8 bytes as one 64-bit lane group. */
  d4s16 = vld1_s16((int16_t *)p1);
  d7s16 = vld1_s16((int16_t *)p2);
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  /* Round the 16-bit residual: (x + 32) >> 6. */
  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  /* Widen the u8 pixels and add them to the rounded residual. */
  q5s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(
      vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16)));

  /* Saturate the sums back to unsigned 8-bit pixel values. */
  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  /* Store the combined rows (pointer advances between stores are missing
   * from this view). */
  vst1_s16((int16_t *)p1, d5s16);
  vst1_s16((int16_t *)p2, d6s16);
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
/* Butterfly on the fixed input registers q14s16/q13s16 (loaded by
 * LOAD_FROM_TRANSPOSED or set up by the caller). */
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
/* 32-bit-precision rotation/butterfly:
 *   *qAs16 = round_sat((q14s16 * first_const  - q13s16 * second_const) >> 14)
 *   *qBs16 = round_sat((q14s16 * second_const + q13s16 * first_const) >> 14)
 * Each 8-lane input is processed as two 4-lane halves so the products fit
 * in 32 bits before the rounding narrow back to int16. */
static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
                                int16_t first_const, int16_t second_const,
                                int16x8_t *qAs16, int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  /* Split both inputs into low/high 4-lane halves. */
  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  /* Broadcast the two rotation constants. */
  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  /* Widening products feeding the subtracting half of the butterfly. */
  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  /* qA accumulators: q14*c1 - q13*c2 (low half in q8, high in q9). */
  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  /* Widening products feeding the adding half. */
  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  /* qB accumulators: q14*c2 + q13*c1 (low half in q11, high in q10). */
  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  /* Rounding, saturating narrow by 14 bits, then recombine the halves. */
  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14));
  /* NOTE(review): closing brace dropped by this extraction. */
/* Transpose one 32-wide, 8-row band of 'input' into t_buf as four
 * consecutive 8x8 tiles (t_buf holds 32*8 int16; see the caller's
 * trans_buf).  Rows of the source are 'stride' (= 32) samples apart.
 * NOTE(review): this extraction dropped lines -- the declarations of 'i'
 * and 'in', the 'in = input' reset, the 'in += stride' steps between the
 * eight loads, the '&q14s16, &q15s16);' continuation of the transpose
 * call, and the 't_buf += 8' steps between the eight stores.  Confirm
 * against upstream before editing. */
static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) {
  const int stride = 32;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  /* Four 8x8 tiles per band; 'input' steps 8 columns right each time. */
  for (i = 0; i < 4; i++, input += 8) {
    /* Load the eight rows of one 8x8 tile. */
    q8s16 = vld1q_s16(in);
    q9s16 = vld1q_s16(in);
    q10s16 = vld1q_s16(in);
    q11s16 = vld1q_s16(in);
    q12s16 = vld1q_s16(in);
    q13s16 = vld1q_s16(in);
    q14s16 = vld1q_s16(in);
    q15s16 = vld1q_s16(in);

    /* In-register 8x8 transpose (call truncated -- see NOTE above). */
    transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,

    /* Store the transposed rows consecutively into t_buf. */
    vst1q_s16(t_buf, q8s16);
    vst1q_s16(t_buf, q9s16);
    vst1q_s16(t_buf, q10s16);
    vst1q_s16(t_buf, q11s16);
    vst1q_s16(t_buf, q12s16);
    vst1q_s16(t_buf, q13s16);
    vst1q_s16(t_buf, q14s16);
    vst1q_s16(t_buf, q15s16);
/* Final recombination of the first pass: fold the band intermediates held
 * in q2..q15 with the rows previously parked in 'out' (row pitch 32),
 * producing sum rows near the top of the block and difference rows near
 * the bottom, and write all of them back to 'out'.
 * NOTE(review): this extraction dropped the last parameter line
 * (int16x8_t q15s16) and the surrounding braces -- q15s16 is used below,
 * so confirm the full signature against upstream. */
static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16,
                                             int16x8_t q3s16, int16x8_t q6s16,
                                             int16x8_t q7s16, int16x8_t q8s16,
                                             int16x8_t q9s16, int16x8_t q10s16,
                                             int16x8_t q11s16, int16x8_t q12s16,
                                             int16x8_t q13s16, int16x8_t q14s16,
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  /* Park the caller's center rows 16/17 and 14/15. */
  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  /* q2/q3 against rows 30/31: sums -> rows 0/1, diffs -> rows 30/31. */
  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  /* q10/q11 against rows 12/13, then the diffs against rows 18/19:
   * sums -> rows 12/13, diffs -> rows 18/19; q2/q3 carry forward. */
  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);
  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  /* Carried q2/q3 against rows 28/29: sums -> 2/3, diffs -> 28/29. */
  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  /* q12/q13 against rows 10/11, then the diffs against rows 20/21. */
  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);
  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  /* Carried q2/q3 against rows 26/27: sums -> 4/5, diffs -> 26/27. */
  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  /* q14/q15 against rows 8/9, then the diffs against rows 22/23. */
  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);
  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  /* Carried q2/q3 against rows 24/25: sums -> 6/7, diffs -> 24/25. */
  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
/* Final recombination of the second pass: the same add/subtract data flow
 * as idct32_bands_end_1st_pass, but the results are combined directly into
 * the destination picture (round >> 6, add pixel, saturate) through the
 * STORE_COMBINE_CENTER/EXTREME helpers instead of being written to 'out'.
 * NOTE(review): this extraction dropped lines -- the r6/r7/r9/r10 cursor
 * advances (presumably by str2, which is otherwise unused in the visible
 * code) between the store groups, and the closing brace.  Confirm against
 * upstream before editing. */
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16,
    int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16,
    int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16,
    int16x8_t q14s16, int16x8_t q15s16) {
  /* Destination cursors: r7 starts at row 0, r6 at row 31, r9 at row 15,
   * r10 at row 16 (outer pair and center pair of the 32-row block). */
  uint8_t *r6 = dest + 31 * stride;
  uint8_t *r7 = dest /* + 0 * stride*/;
  uint8_t *r9 = dest + 15 * stride;
  uint8_t *r10 = dest + 16 * stride;
  int str2 = stride << 1;
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  /* Center rows from the caller's q6..q9. */
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  /* q2/q3 against rows 30/31: sums/diffs go to the outer cursors. */
  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);

  /* q10/q11 against rows 12/13, then the diffs against rows 18/19. */
  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);
  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  /* Carried q2/q3 against rows 28/29. */
  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);

  /* q12/q13 against rows 10/11, then the diffs against rows 20/21. */
  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);
  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  /* Carried q2/q3 against rows 26/27. */
  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);

  /* q14/q15 against rows 8/9, then the diffs against rows 22/23. */
  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);
  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  /* Carried q2/q3 against rows 24/25 -- the last row group. */
  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
386 void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
388 int i, idct32_pass_loop;
389 int16_t trans_buf[32 * 8];
390 int16_t pass1[32 * 32];
391 int16_t pass2[32 * 32];
393 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
394 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
396 for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
398 input = pass1, // the input of pass2 is the result of pass1
400 for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop
401 idct32_transpose_pair(input, trans_buf);
403 // -----------------------------------------
404 // BLOCK A: 16-19,28-31
405 // -----------------------------------------
406 // generate 16,17,30,31
408 LOAD_FROM_TRANSPOSED(0, 1, 31)
409 DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
410 LOAD_FROM_TRANSPOSED(31, 17, 15)
411 DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
413 q4s16 = vaddq_s16(q0s16, q1s16);
414 q13s16 = vsubq_s16(q0s16, q1s16);
415 q6s16 = vaddq_s16(q2s16, q3s16);
416 q14s16 = vsubq_s16(q2s16, q3s16);
418 DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
420 // generate 18,19,28,29
422 LOAD_FROM_TRANSPOSED(15, 9, 23)
423 DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
424 LOAD_FROM_TRANSPOSED(23, 25, 7)
425 DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
427 q13s16 = vsubq_s16(q3s16, q2s16);
428 q3s16 = vaddq_s16(q3s16, q2s16);
429 q14s16 = vsubq_s16(q1s16, q0s16);
430 q2s16 = vaddq_s16(q1s16, q0s16);
432 DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
434 q8s16 = vaddq_s16(q4s16, q2s16);
435 q9s16 = vaddq_s16(q5s16, q0s16);
436 q10s16 = vaddq_s16(q7s16, q1s16);
437 q15s16 = vaddq_s16(q6s16, q3s16);
438 q13s16 = vsubq_s16(q5s16, q0s16);
439 q14s16 = vsubq_s16(q7s16, q1s16);
440 STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
441 STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
443 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
444 STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
446 q13s16 = vsubq_s16(q4s16, q2s16);
447 q14s16 = vsubq_s16(q6s16, q3s16);
449 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
450 STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
452 // -----------------------------------------
453 // BLOCK B: 20-23,24-27
454 // -----------------------------------------
455 // generate 20,21,26,27
457 LOAD_FROM_TRANSPOSED(7, 5, 27)
458 DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
459 LOAD_FROM_TRANSPOSED(27, 21, 11)
460 DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
462 q13s16 = vsubq_s16(q0s16, q1s16);
463 q0s16 = vaddq_s16(q0s16, q1s16);
464 q14s16 = vsubq_s16(q2s16, q3s16);
465 q2s16 = vaddq_s16(q2s16, q3s16);
467 DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
469 // generate 22,23,24,25
471 LOAD_FROM_TRANSPOSED(11, 13, 19)
472 DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
473 LOAD_FROM_TRANSPOSED(19, 29, 3)
474 DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
476 q14s16 = vsubq_s16(q4s16, q5s16);
477 q5s16 = vaddq_s16(q4s16, q5s16);
478 q13s16 = vsubq_s16(q6s16, q7s16);
479 q6s16 = vaddq_s16(q6s16, q7s16);
481 DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
483 q10s16 = vaddq_s16(q7s16, q1s16);
484 q11s16 = vaddq_s16(q5s16, q0s16);
485 q12s16 = vaddq_s16(q6s16, q2s16);
486 q15s16 = vaddq_s16(q4s16, q3s16);
488 LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
489 q8s16 = vaddq_s16(q14s16, q11s16);
490 q9s16 = vaddq_s16(q13s16, q10s16);
491 q13s16 = vsubq_s16(q13s16, q10s16);
492 q11s16 = vsubq_s16(q14s16, q11s16);
493 STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
494 LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
495 q8s16 = vsubq_s16(q9s16, q12s16);
496 q10s16 = vaddq_s16(q14s16, q15s16);
497 q14s16 = vsubq_s16(q14s16, q15s16);
498 q12s16 = vaddq_s16(q9s16, q12s16);
499 STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
501 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
502 STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
505 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
506 STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
508 q14s16 = vsubq_s16(q5s16, q0s16);
509 q13s16 = vsubq_s16(q6s16, q2s16);
510 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
511 q14s16 = vsubq_s16(q7s16, q1s16);
512 q13s16 = vsubq_s16(q4s16, q3s16);
513 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
515 LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
516 q8s16 = vaddq_s16(q14s16, q1s16);
517 q9s16 = vaddq_s16(q13s16, q6s16);
518 q13s16 = vsubq_s16(q13s16, q6s16);
519 q1s16 = vsubq_s16(q14s16, q1s16);
520 STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
521 LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
522 q14s16 = vsubq_s16(q8s16, q5s16);
523 q10s16 = vaddq_s16(q8s16, q5s16);
524 q11s16 = vaddq_s16(q9s16, q0s16);
525 q0s16 = vsubq_s16(q9s16, q0s16);
526 STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
528 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
529 STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
530 DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16);
531 STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
533 // -----------------------------------------
534 // BLOCK C: 8-10,11-15
535 // -----------------------------------------
536 // generate 8,9,14,15
538 LOAD_FROM_TRANSPOSED(3, 2, 30)
539 DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
540 LOAD_FROM_TRANSPOSED(30, 18, 14)
541 DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
543 q13s16 = vsubq_s16(q0s16, q1s16);
544 q0s16 = vaddq_s16(q0s16, q1s16);
545 q14s16 = vsubq_s16(q2s16, q3s16);
546 q2s16 = vaddq_s16(q2s16, q3s16);
548 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
550 // generate 10,11,12,13
552 LOAD_FROM_TRANSPOSED(14, 10, 22)
553 DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
554 LOAD_FROM_TRANSPOSED(22, 26, 6)
555 DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
557 q14s16 = vsubq_s16(q4s16, q5s16);
558 q5s16 = vaddq_s16(q4s16, q5s16);
559 q13s16 = vsubq_s16(q6s16, q7s16);
560 q6s16 = vaddq_s16(q6s16, q7s16);
562 DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
564 q8s16 = vaddq_s16(q0s16, q5s16);
565 q9s16 = vaddq_s16(q1s16, q7s16);
566 q13s16 = vsubq_s16(q1s16, q7s16);
567 q14s16 = vsubq_s16(q3s16, q4s16);
568 q10s16 = vaddq_s16(q3s16, q4s16);
569 q15s16 = vaddq_s16(q2s16, q6s16);
570 STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
571 STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
573 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
574 STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
575 q13s16 = vsubq_s16(q0s16, q5s16);
576 q14s16 = vsubq_s16(q2s16, q6s16);
577 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
578 STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
580 // -----------------------------------------
582 // -----------------------------------------
585 LOAD_FROM_TRANSPOSED(6, 4, 28)
586 DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
587 LOAD_FROM_TRANSPOSED(28, 20, 12)
588 DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
590 q13s16 = vsubq_s16(q0s16, q1s16);
591 q0s16 = vaddq_s16(q0s16, q1s16);
592 q14s16 = vsubq_s16(q2s16, q3s16);
593 q2s16 = vaddq_s16(q2s16, q3s16);
595 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
599 LOAD_FROM_TRANSPOSED(12, 0, 16)
600 DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
601 LOAD_FROM_TRANSPOSED(16, 8, 24)
602 DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
604 q4s16 = vaddq_s16(q7s16, q6s16);
605 q7s16 = vsubq_s16(q7s16, q6s16);
606 q6s16 = vsubq_s16(q5s16, q14s16);
607 q5s16 = vaddq_s16(q5s16, q14s16);
609 q8s16 = vaddq_s16(q4s16, q2s16);
610 q9s16 = vaddq_s16(q5s16, q3s16);
611 q10s16 = vaddq_s16(q6s16, q1s16);
612 q11s16 = vaddq_s16(q7s16, q0s16);
613 q12s16 = vsubq_s16(q7s16, q0s16);
614 q13s16 = vsubq_s16(q6s16, q1s16);
615 q14s16 = vsubq_s16(q5s16, q3s16);
616 q15s16 = vsubq_s16(q4s16, q2s16);
618 LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
619 q2s16 = vaddq_s16(q8s16, q1s16);
620 q3s16 = vaddq_s16(q9s16, q0s16);
621 q4s16 = vsubq_s16(q9s16, q0s16);
622 q5s16 = vsubq_s16(q8s16, q1s16);
623 LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
624 q8s16 = vaddq_s16(q4s16, q1s16);
625 q9s16 = vaddq_s16(q5s16, q0s16);
626 q6s16 = vsubq_s16(q5s16, q0s16);
627 q7s16 = vsubq_s16(q4s16, q1s16);
629 if (idct32_pass_loop == 0) {
630 idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
631 q10s16, q11s16, q12s16, q13s16, q14s16,
634 idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16,
635 q8s16, q9s16, q10s16, q11s16, q12s16, q13s16,