/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/arm/transpose_neon.h"
16 #include "vpx_dsp/txfm_common.h"
18 static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
19 int16x8_t *q10s16, int16x8_t *q11s16,
20 int16x8_t *q12s16, int16x8_t *q13s16,
21 int16x8_t *q14s16, int16x8_t *q15s16) {
22 int16x4_t d0s16, d1s16, d2s16, d3s16;
23 int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
24 int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
25 int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
26 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
27 int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
28 int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
30 d0s16 = vdup_n_s16(cospi_28_64);
31 d1s16 = vdup_n_s16(cospi_4_64);
32 d2s16 = vdup_n_s16(cospi_12_64);
33 d3s16 = vdup_n_s16(cospi_20_64);
35 d16s16 = vget_low_s16(*q8s16);
36 d17s16 = vget_high_s16(*q8s16);
37 d18s16 = vget_low_s16(*q9s16);
38 d19s16 = vget_high_s16(*q9s16);
39 d20s16 = vget_low_s16(*q10s16);
40 d21s16 = vget_high_s16(*q10s16);
41 d22s16 = vget_low_s16(*q11s16);
42 d23s16 = vget_high_s16(*q11s16);
43 d24s16 = vget_low_s16(*q12s16);
44 d25s16 = vget_high_s16(*q12s16);
45 d26s16 = vget_low_s16(*q13s16);
46 d27s16 = vget_high_s16(*q13s16);
47 d28s16 = vget_low_s16(*q14s16);
48 d29s16 = vget_high_s16(*q14s16);
49 d30s16 = vget_low_s16(*q15s16);
50 d31s16 = vget_high_s16(*q15s16);
52 q2s32 = vmull_s16(d18s16, d0s16);
53 q3s32 = vmull_s16(d19s16, d0s16);
54 q5s32 = vmull_s16(d26s16, d2s16);
55 q6s32 = vmull_s16(d27s16, d2s16);
57 q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
58 q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
59 q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
60 q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
62 d8s16 = vqrshrn_n_s32(q2s32, 14);
63 d9s16 = vqrshrn_n_s32(q3s32, 14);
64 d10s16 = vqrshrn_n_s32(q5s32, 14);
65 d11s16 = vqrshrn_n_s32(q6s32, 14);
66 q4s16 = vcombine_s16(d8s16, d9s16);
67 q5s16 = vcombine_s16(d10s16, d11s16);
69 q2s32 = vmull_s16(d18s16, d1s16);
70 q3s32 = vmull_s16(d19s16, d1s16);
71 q9s32 = vmull_s16(d26s16, d3s16);
72 q13s32 = vmull_s16(d27s16, d3s16);
74 q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
75 q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
76 q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
77 q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
79 d14s16 = vqrshrn_n_s32(q2s32, 14);
80 d15s16 = vqrshrn_n_s32(q3s32, 14);
81 d12s16 = vqrshrn_n_s32(q9s32, 14);
82 d13s16 = vqrshrn_n_s32(q13s32, 14);
83 q6s16 = vcombine_s16(d12s16, d13s16);
84 q7s16 = vcombine_s16(d14s16, d15s16);
86 d0s16 = vdup_n_s16(cospi_16_64);
88 q2s32 = vmull_s16(d16s16, d0s16);
89 q3s32 = vmull_s16(d17s16, d0s16);
90 q13s32 = vmull_s16(d16s16, d0s16);
91 q15s32 = vmull_s16(d17s16, d0s16);
93 q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
94 q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
95 q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
96 q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
98 d0s16 = vdup_n_s16(cospi_24_64);
99 d1s16 = vdup_n_s16(cospi_8_64);
101 d18s16 = vqrshrn_n_s32(q2s32, 14);
102 d19s16 = vqrshrn_n_s32(q3s32, 14);
103 d22s16 = vqrshrn_n_s32(q13s32, 14);
104 d23s16 = vqrshrn_n_s32(q15s32, 14);
105 *q9s16 = vcombine_s16(d18s16, d19s16);
106 *q11s16 = vcombine_s16(d22s16, d23s16);
108 q2s32 = vmull_s16(d20s16, d0s16);
109 q3s32 = vmull_s16(d21s16, d0s16);
110 q8s32 = vmull_s16(d20s16, d1s16);
111 q12s32 = vmull_s16(d21s16, d1s16);
113 q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
114 q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
115 q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
116 q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
118 d26s16 = vqrshrn_n_s32(q2s32, 14);
119 d27s16 = vqrshrn_n_s32(q3s32, 14);
120 d30s16 = vqrshrn_n_s32(q8s32, 14);
121 d31s16 = vqrshrn_n_s32(q12s32, 14);
122 *q13s16 = vcombine_s16(d26s16, d27s16);
123 *q15s16 = vcombine_s16(d30s16, d31s16);
125 q0s16 = vaddq_s16(*q9s16, *q15s16);
126 q1s16 = vaddq_s16(*q11s16, *q13s16);
127 q2s16 = vsubq_s16(*q11s16, *q13s16);
128 q3s16 = vsubq_s16(*q9s16, *q15s16);
130 *q13s16 = vsubq_s16(q4s16, q5s16);
131 q4s16 = vaddq_s16(q4s16, q5s16);
132 *q14s16 = vsubq_s16(q7s16, q6s16);
133 q7s16 = vaddq_s16(q7s16, q6s16);
134 d26s16 = vget_low_s16(*q13s16);
135 d27s16 = vget_high_s16(*q13s16);
136 d28s16 = vget_low_s16(*q14s16);
137 d29s16 = vget_high_s16(*q14s16);
139 d16s16 = vdup_n_s16(cospi_16_64);
141 q9s32 = vmull_s16(d28s16, d16s16);
142 q10s32 = vmull_s16(d29s16, d16s16);
143 q11s32 = vmull_s16(d28s16, d16s16);
144 q12s32 = vmull_s16(d29s16, d16s16);
146 q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
147 q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
148 q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
149 q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
151 d10s16 = vqrshrn_n_s32(q9s32, 14);
152 d11s16 = vqrshrn_n_s32(q10s32, 14);
153 d12s16 = vqrshrn_n_s32(q11s32, 14);
154 d13s16 = vqrshrn_n_s32(q12s32, 14);
155 q5s16 = vcombine_s16(d10s16, d11s16);
156 q6s16 = vcombine_s16(d12s16, d13s16);
158 *q8s16 = vaddq_s16(q0s16, q7s16);
159 *q9s16 = vaddq_s16(q1s16, q6s16);
160 *q10s16 = vaddq_s16(q2s16, q5s16);
161 *q11s16 = vaddq_s16(q3s16, q4s16);
162 *q12s16 = vsubq_s16(q3s16, q4s16);
163 *q13s16 = vsubq_s16(q2s16, q5s16);
164 *q14s16 = vsubq_s16(q1s16, q6s16);
165 *q15s16 = vsubq_s16(q0s16, q7s16);
169 void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
172 uint8x8_t d0u8, d1u8, d2u8, d3u8;
173 uint64x1_t d0u64, d1u64, d2u64, d3u64;
174 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
175 uint16x8_t q8u16, q9u16, q10u16, q11u16;
177 q8s16 = vld1q_s16(input);
178 q9s16 = vld1q_s16(input + 8);
179 q10s16 = vld1q_s16(input + 16);
180 q11s16 = vld1q_s16(input + 24);
181 q12s16 = vld1q_s16(input + 32);
182 q13s16 = vld1q_s16(input + 40);
183 q14s16 = vld1q_s16(input + 48);
184 q15s16 = vld1q_s16(input + 56);
186 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
189 IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
192 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
195 IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
198 q8s16 = vrshrq_n_s16(q8s16, 5);
199 q9s16 = vrshrq_n_s16(q9s16, 5);
200 q10s16 = vrshrq_n_s16(q10s16, 5);
201 q11s16 = vrshrq_n_s16(q11s16, 5);
202 q12s16 = vrshrq_n_s16(q12s16, 5);
203 q13s16 = vrshrq_n_s16(q13s16, 5);
204 q14s16 = vrshrq_n_s16(q14s16, 5);
205 q15s16 = vrshrq_n_s16(q15s16, 5);
209 d0u64 = vld1_u64((uint64_t *)d1);
211 d1u64 = vld1_u64((uint64_t *)d1);
213 d2u64 = vld1_u64((uint64_t *)d1);
215 d3u64 = vld1_u64((uint64_t *)d1);
218 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
219 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
220 q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
221 q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
223 d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
224 d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
225 d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
226 d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
228 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
230 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
232 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
234 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
242 d0u64 = vld1_u64((uint64_t *)d1);
244 d1u64 = vld1_u64((uint64_t *)d1);
246 d2u64 = vld1_u64((uint64_t *)d1);
248 d3u64 = vld1_u64((uint64_t *)d1);
251 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
252 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
253 q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
254 q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
256 d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
257 d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
258 d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
259 d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
261 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
263 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
265 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
267 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
272 void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest,
275 uint8x8_t d0u8, d1u8, d2u8, d3u8;
276 int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
277 int16x4_t d26s16, d27s16, d28s16, d29s16;
278 uint64x1_t d0u64, d1u64, d2u64, d3u64;
279 int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
280 int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
281 uint16x8_t q8u16, q9u16, q10u16, q11u16;
282 int32x4_t q9s32, q10s32, q11s32, q12s32;
284 q8s16 = vld1q_s16(input);
285 q9s16 = vld1q_s16(input + 8);
286 q10s16 = vld1q_s16(input + 16);
287 q11s16 = vld1q_s16(input + 24);
288 q12s16 = vld1q_s16(input + 32);
289 q13s16 = vld1q_s16(input + 40);
290 q14s16 = vld1q_s16(input + 48);
291 q15s16 = vld1q_s16(input + 56);
293 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
296 // First transform rows
298 q0s16 = vdupq_n_s16(cospi_28_64 * 2);
299 q1s16 = vdupq_n_s16(cospi_4_64 * 2);
301 q4s16 = vqrdmulhq_s16(q9s16, q0s16);
303 q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
305 q7s16 = vqrdmulhq_s16(q9s16, q1s16);
307 q1s16 = vdupq_n_s16(cospi_12_64 * 2);
309 q5s16 = vqrdmulhq_s16(q11s16, q0s16);
311 q0s16 = vdupq_n_s16(cospi_16_64 * 2);
313 q6s16 = vqrdmulhq_s16(q11s16, q1s16);
315 // stage 2 & stage 3 - even half
316 q1s16 = vdupq_n_s16(cospi_24_64 * 2);
318 q9s16 = vqrdmulhq_s16(q8s16, q0s16);
320 q0s16 = vdupq_n_s16(cospi_8_64 * 2);
322 q13s16 = vqrdmulhq_s16(q10s16, q1s16);
324 q15s16 = vqrdmulhq_s16(q10s16, q0s16);
327 q0s16 = vaddq_s16(q9s16, q15s16);
328 q1s16 = vaddq_s16(q9s16, q13s16);
329 q2s16 = vsubq_s16(q9s16, q13s16);
330 q3s16 = vsubq_s16(q9s16, q15s16);
332 // stage 2 - odd half
333 q13s16 = vsubq_s16(q4s16, q5s16);
334 q4s16 = vaddq_s16(q4s16, q5s16);
335 q14s16 = vsubq_s16(q7s16, q6s16);
336 q7s16 = vaddq_s16(q7s16, q6s16);
337 d26s16 = vget_low_s16(q13s16);
338 d27s16 = vget_high_s16(q13s16);
339 d28s16 = vget_low_s16(q14s16);
340 d29s16 = vget_high_s16(q14s16);
342 d16s16 = vdup_n_s16(cospi_16_64);
343 q9s32 = vmull_s16(d28s16, d16s16);
344 q10s32 = vmull_s16(d29s16, d16s16);
345 q11s32 = vmull_s16(d28s16, d16s16);
346 q12s32 = vmull_s16(d29s16, d16s16);
348 q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
349 q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
350 q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
351 q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
353 d10s16 = vqrshrn_n_s32(q9s32, 14);
354 d11s16 = vqrshrn_n_s32(q10s32, 14);
355 d12s16 = vqrshrn_n_s32(q11s32, 14);
356 d13s16 = vqrshrn_n_s32(q12s32, 14);
357 q5s16 = vcombine_s16(d10s16, d11s16);
358 q6s16 = vcombine_s16(d12s16, d13s16);
361 q8s16 = vaddq_s16(q0s16, q7s16);
362 q9s16 = vaddq_s16(q1s16, q6s16);
363 q10s16 = vaddq_s16(q2s16, q5s16);
364 q11s16 = vaddq_s16(q3s16, q4s16);
365 q12s16 = vsubq_s16(q3s16, q4s16);
366 q13s16 = vsubq_s16(q2s16, q5s16);
367 q14s16 = vsubq_s16(q1s16, q6s16);
368 q15s16 = vsubq_s16(q0s16, q7s16);
370 transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
373 IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
376 q8s16 = vrshrq_n_s16(q8s16, 5);
377 q9s16 = vrshrq_n_s16(q9s16, 5);
378 q10s16 = vrshrq_n_s16(q10s16, 5);
379 q11s16 = vrshrq_n_s16(q11s16, 5);
380 q12s16 = vrshrq_n_s16(q12s16, 5);
381 q13s16 = vrshrq_n_s16(q13s16, 5);
382 q14s16 = vrshrq_n_s16(q14s16, 5);
383 q15s16 = vrshrq_n_s16(q15s16, 5);
387 d0u64 = vld1_u64((uint64_t *)d1);
389 d1u64 = vld1_u64((uint64_t *)d1);
391 d2u64 = vld1_u64((uint64_t *)d1);
393 d3u64 = vld1_u64((uint64_t *)d1);
396 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
397 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
398 q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
399 q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
401 d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
402 d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
403 d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
404 d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
406 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
408 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
410 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
412 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
420 d0u64 = vld1_u64((uint64_t *)d1);
422 d1u64 = vld1_u64((uint64_t *)d1);
424 d2u64 = vld1_u64((uint64_t *)d1);
426 d3u64 = vld1_u64((uint64_t *)d1);
429 q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
430 q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
431 q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
432 q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
434 d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
435 d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
436 d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
437 d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
439 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
441 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
443 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
445 vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));