2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/txfm_common_sse2.h"
// Reconstruct one 4-pixel row: load 4 bytes from dest, widen to 16-bit,
// add the 16-bit residual vector in_x, saturate back to unsigned 8-bit,
// and store the low 4 bytes back to dest.  Relies on a __m128i `zero`
// being in scope at the expansion site.
// NOTE(review): the macro's enclosing brace lines are not visible in
// this chunk of the file.
15 #define RECON_AND_STORE4X4(dest, in_x) \
17 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
18 d0 = _mm_unpacklo_epi8(d0, zero); \
19 d0 = _mm_add_epi16(in_x, d0); \
20 d0 = _mm_packus_epi16(d0, d0); \
21 *(int *)(dest) = _mm_cvtsi128_si32(d0); \
// Full 4x4 inverse DCT (all 16 coefficients) plus reconstruction: the
// residual is added to the 4x4 block at `dest` with unsigned saturation.
// Each 1-D pass is done with _mm_madd_epi16 against the packed cospi
// constant vector `cst`, then rounded by DCT_CONST_ROUNDING and shifted
// by DCT_CONST_BITS.  Final scaling is (x + 8) >> 4.
// NOTE(review): the `int stride) {` tail of the signature and several
// brace-only lines are not visible in this chunk of the file.
24 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
26 const __m128i zero = _mm_setzero_si128();
27 const __m128i eight = _mm_set1_epi16(8);
28 const __m128i cst = _mm_setr_epi16(
29 (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
30 (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
31 (int16_t)cospi_8_64, (int16_t)cospi_24_64);
32 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
33 __m128i input0, input1, input2, input3;
// Load the 16 coefficients as two registers of 8 (rows 0-1 and 2-3).
36 input0 = load_input_data(input);
37 input2 = load_input_data(input + 8);
39 // Construct i3, i1, i3, i1, i2, i0, i2, i0
40 input0 = _mm_shufflelo_epi16(input0, 0xd8);
41 input0 = _mm_shufflehi_epi16(input0, 0xd8);
42 input2 = _mm_shufflelo_epi16(input2, 0xd8);
43 input2 = _mm_shufflehi_epi16(input2, 0xd8);
45 input1 = _mm_unpackhi_epi32(input0, input0);
46 input0 = _mm_unpacklo_epi32(input0, input0);
47 input3 = _mm_unpackhi_epi32(input2, input2);
48 input2 = _mm_unpacklo_epi32(input2, input2);
// Pass 1 butterflies: each madd pairs the interleaved coefficients with
// the matching cospi pair in `cst`.
51 input0 = _mm_madd_epi16(input0, cst);
52 input1 = _mm_madd_epi16(input1, cst);
53 input2 = _mm_madd_epi16(input2, cst);
54 input3 = _mm_madd_epi16(input3, cst);
56 input0 = _mm_add_epi32(input0, rounding);
57 input1 = _mm_add_epi32(input1, rounding);
58 input2 = _mm_add_epi32(input2, rounding);
59 input3 = _mm_add_epi32(input3, rounding);
61 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
62 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
63 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
64 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
// Back to 16-bit and transpose for the second (column) pass.
67 input0 = _mm_packs_epi32(input0, input1);
68 input1 = _mm_packs_epi32(input2, input3);
71 input2 = _mm_unpacklo_epi16(input0, input1);
72 input3 = _mm_unpackhi_epi16(input0, input1);
73 input0 = _mm_unpacklo_epi32(input2, input3);
74 input1 = _mm_unpackhi_epi32(input2, input3);
76 // Switch column2, column 3, and then, we got:
77 // input2: column1, column 0; input3: column2, column 3.
78 input1 = _mm_shuffle_epi32(input1, 0x4e);
79 input2 = _mm_add_epi16(input0, input1);
80 input3 = _mm_sub_epi16(input0, input1);
// Pass 2 (columns): same reordering and butterflies as pass 1.
83 // Construct i3, i1, i3, i1, i2, i0, i2, i0
84 input0 = _mm_unpacklo_epi32(input2, input2);
85 input1 = _mm_unpackhi_epi32(input2, input2);
86 input2 = _mm_unpackhi_epi32(input3, input3);
87 input3 = _mm_unpacklo_epi32(input3, input3);
90 input0 = _mm_madd_epi16(input0, cst);
91 input1 = _mm_madd_epi16(input1, cst);
92 input2 = _mm_madd_epi16(input2, cst);
93 input3 = _mm_madd_epi16(input3, cst);
95 input0 = _mm_add_epi32(input0, rounding);
96 input1 = _mm_add_epi32(input1, rounding);
97 input2 = _mm_add_epi32(input2, rounding);
98 input3 = _mm_add_epi32(input3, rounding);
100 input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
101 input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
102 input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
103 input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
106 input0 = _mm_packs_epi32(input0, input2);
107 input1 = _mm_packs_epi32(input1, input3);
110 input2 = _mm_unpacklo_epi16(input0, input1);
111 input3 = _mm_unpackhi_epi16(input0, input1);
112 input0 = _mm_unpacklo_epi32(input2, input3);
113 input1 = _mm_unpackhi_epi32(input2, input3);
115 // Switch column2, column 3, and then, we got:
116 // input2: column1, column 0; input3: column2, column 3.
117 input1 = _mm_shuffle_epi32(input1, 0x4e);
118 input2 = _mm_add_epi16(input0, input1);
119 input3 = _mm_sub_epi16(input0, input1);
121 // Final round and shift
122 input2 = _mm_add_epi16(input2, eight);
123 input3 = _mm_add_epi16(input3, eight);
125 input2 = _mm_srai_epi16(input2, 4);
126 input3 = _mm_srai_epi16(input3, 4);
128 // Reconstruction and Store
130 __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
131 __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
// d0 gathers dest rows 0|1; d2 gathers rows 3|2 (note reversed operand
// order on the next unpack), so the packed result holds rows 0,1,3,2.
132 d0 = _mm_unpacklo_epi32(d0,
133 _mm_cvtsi32_si128(*(const int *)(dest + stride)));
134 d2 = _mm_unpacklo_epi32(
135 _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
136 d0 = _mm_unpacklo_epi8(d0, zero);
137 d2 = _mm_unpacklo_epi8(d2, zero);
138 d0 = _mm_add_epi16(d0, input2);
139 d2 = _mm_add_epi16(d2, input3);
140 d0 = _mm_packus_epi16(d0, d2);
// Because d2 was built as row3|row2, rows 2 and 3 are stored in
// swapped order below.
142 *(int *)dest = _mm_cvtsi128_si32(d0);
144 d0 = _mm_srli_si128(d0, 4);
145 *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
147 d0 = _mm_srli_si128(d0, 4);
148 *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
150 d0 = _mm_srli_si128(d0, 4);
151 *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
// DC-only 4x4 idct: input[0] is scaled twice by cospi_16_64 (the two
// 1-D passes collapse to scalar multiplies for a DC-only block), rounded
// by ROUND_POWER_OF_TWO(., 4), splatted, and added to all 16 destination
// pixels via RECON_AND_STORE4X4 (which uses `zero` from this scope).
// NOTE(review): the declarations of `a` and `dc_value` and the
// `int stride) {` signature tail are not visible in this chunk.
155 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
158 const __m128i zero = _mm_setzero_si128();
161 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
162 a = (int)dct_const_round_shift(a * cospi_16_64);
163 a = ROUND_POWER_OF_TWO(a, 4);
165 dc_value = _mm_set1_epi16(a);
167 RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
168 RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
169 RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
170 RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
// Transpose a 4x4 16-bit matrix stored two rows per register in
// res[0]/res[1], in place, using two 16-bit interleave passes.
173 static INLINE void transpose_4x4(__m128i *res) {
174 const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
175 const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
177 res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
178 res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
// In-place 1-D 4-point idct on in[0]/in[1] (each register holds two
// 4-wide rows).  Even part uses the cospi_16 butterflies, odd part the
// cospi_8/cospi_24 pair; madd results are rounded by DCT_CONST_ROUNDING
// and shifted by DCT_CONST_BITS before repacking to 16-bit.
181 void idct4_sse2(__m128i *in) {
182 const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
183 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
184 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
185 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
186 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
191 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
192 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
193 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
194 v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
195 v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
196 v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
198 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
199 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
200 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
201 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
203 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
204 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
205 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
206 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
// u[1] deliberately packs v[3] before v[2] so the following add/sub
// butterfly lines up outputs 0..3 after the final half-swap of in[1].
208 u[0] = _mm_packs_epi32(v[0], v[1]);
209 u[1] = _mm_packs_epi32(v[3], v[2]);
212 in[0] = _mm_add_epi16(u[0], u[1]);
213 in[1] = _mm_sub_epi16(u[0], u[1]);
// Swap the two 64-bit halves of in[1] to restore natural output order.
214 in[1] = _mm_shuffle_epi32(in[1], 0x4E);
// In-place 1-D 4-point inverse ADST on in[0]/in[1] (two 4-wide rows per
// register), using the sinpi_{1..4}_9 constants per the VP9 iadst4
// definition.  Intermediate sums are rounded by DCT_CONST_ROUNDING and
// shifted by DCT_CONST_BITS before repacking to 16-bit.
217 void iadst4_sse2(__m128i *in) {
218 const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
219 const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
220 const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
221 const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
222 const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
223 const __m128i kZero = _mm_set1_epi16(0);
224 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
225 __m128i u[8], v[8], in7;
// in7 combines the input rows to form the x0 + x1 - x3 term used for
// the third iadst output — derived lane-wise from in[0]/in[1]; verify
// against the scalar iadst4 reference.
228 in7 = _mm_srli_si128(in[1], 8);
229 in7 = _mm_add_epi16(in7, in[0]);
230 in7 = _mm_sub_epi16(in7, in[1]);
232 u[0] = _mm_unpacklo_epi16(in[0], in[1]);
233 u[1] = _mm_unpackhi_epi16(in[0], in[1]);
234 u[2] = _mm_unpacklo_epi16(in7, kZero);
235 u[3] = _mm_unpackhi_epi16(in[0], kZero);
237 v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
238 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
239 v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
240 v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
241 v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
242 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
244 u[0] = _mm_add_epi32(v[0], v[1]);
245 u[1] = _mm_add_epi32(v[3], v[4]);
// u[6] = u[3] + v[5] - 4*v[5] = u[3] - 3*s2 (the fourth output term).
247 u[3] = _mm_add_epi32(u[0], u[1]);
248 u[4] = _mm_slli_epi32(v[5], 2);
249 u[5] = _mm_add_epi32(u[3], v[5]);
250 u[6] = _mm_sub_epi32(u[5], u[4]);
252 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
253 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
254 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
255 v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
257 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
258 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
259 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
260 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
// Pack the four 32-bit output rows back into two 16-bit registers.
262 in[0] = _mm_packs_epi32(u[0], u[1]);
263 in[1] = _mm_packs_epi32(u[2], u[3]);
// Transpose eight 16-bit 8-lane rows in0..in7 into out0..out7 using the
// classic three-stage interleave (16-bit, then 32-bit, then 64-bit
// unpacks).  In-place use (outN aliasing inM) is safe because all outN
// assignments happen after every inM has been read into the tr0_*/tr1_*
// temporaries.
266 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
267 out2, out3, out4, out5, out6, out7) \
269 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
270 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
271 const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
272 const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
273 const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
274 const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
275 const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
276 const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
278 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
279 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
280 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
281 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
282 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
283 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
284 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
285 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
287 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
288 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
289 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
290 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
291 out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
292 out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
293 out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
294 out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
// Reduced transpose producing four output registers from four inputs.
// Some pairs are interleaved in reversed operand order (tmp1,tmp0 and
// tmp3,tmp2) — presumably to match the row layout produced by the
// sparse-coefficient idct8x8 path that expands this macro; verify
// against its caller.
297 #define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
299 const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
300 const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
301 const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
302 const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
304 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
305 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
306 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
307 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
309 out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
310 out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
311 out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
312 out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
// Minimal transpose for sparse inputs: interleaves only the low halves
// of in0..in3 into two output registers.  The "_10" suffix suggests it
// is for blocks whose non-zero coefficients fit in the top-left corner
// (only the first few rows/columns matter) — verify against callers.
315 #define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
317 const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
318 const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
319 out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
320 out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
323 // Define Macro for multiplying elements by constants and adding them together.
// Computes four rotation outputs from two interleaved input pairs:
//   res_i = (madd(lo/hi, cst_j) + rounding) >> DCT_CONST_BITS
// packed back to 16-bit with signed saturation.  Requires tmp0..tmp7
// and `rounding` to be declared at the expansion site.
324 #define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
325 res0, res1, res2, res3) \
327 tmp0 = _mm_madd_epi16(lo_0, cst0); \
328 tmp1 = _mm_madd_epi16(hi_0, cst0); \
329 tmp2 = _mm_madd_epi16(lo_0, cst1); \
330 tmp3 = _mm_madd_epi16(hi_0, cst1); \
331 tmp4 = _mm_madd_epi16(lo_1, cst2); \
332 tmp5 = _mm_madd_epi16(hi_1, cst2); \
333 tmp6 = _mm_madd_epi16(lo_1, cst3); \
334 tmp7 = _mm_madd_epi16(hi_1, cst3); \
336 tmp0 = _mm_add_epi32(tmp0, rounding); \
337 tmp1 = _mm_add_epi32(tmp1, rounding); \
338 tmp2 = _mm_add_epi32(tmp2, rounding); \
339 tmp3 = _mm_add_epi32(tmp3, rounding); \
340 tmp4 = _mm_add_epi32(tmp4, rounding); \
341 tmp5 = _mm_add_epi32(tmp5, rounding); \
342 tmp6 = _mm_add_epi32(tmp6, rounding); \
343 tmp7 = _mm_add_epi32(tmp7, rounding); \
345 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
346 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
347 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
348 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
349 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
350 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
351 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
352 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
354 res0 = _mm_packs_epi32(tmp0, tmp1); \
355 res1 = _mm_packs_epi32(tmp2, tmp3); \
356 res2 = _mm_packs_epi32(tmp4, tmp5); \
357 res3 = _mm_packs_epi32(tmp6, tmp7); \
// Two-output variant of MULTIPLICATION_AND_ADD: a single interleaved
// pair rotated by two constants, rounded/shifted and repacked to 16-bit.
// Requires tmp0..tmp3 and `rounding` at the expansion site.
360 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
362 tmp0 = _mm_madd_epi16(lo_0, cst0); \
363 tmp1 = _mm_madd_epi16(hi_0, cst0); \
364 tmp2 = _mm_madd_epi16(lo_0, cst1); \
365 tmp3 = _mm_madd_epi16(hi_0, cst1); \
367 tmp0 = _mm_add_epi32(tmp0, rounding); \
368 tmp1 = _mm_add_epi32(tmp1, rounding); \
369 tmp2 = _mm_add_epi32(tmp2, rounding); \
370 tmp3 = _mm_add_epi32(tmp3, rounding); \
372 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
373 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
374 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
375 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
377 res0 = _mm_packs_epi32(tmp0, tmp1); \
378 res1 = _mm_packs_epi32(tmp2, tmp3); \
// One 8-point 1-D idct over eight 16-bit row registers in0..in7.
// Stage 1: odd inputs (1,7,3,5) rotated by stg1_0..3.
// Stage 2: even inputs (0,4,2,6) rotated by stg2_0..3, plus saturating
//          add/sub butterflies on the stage-1 odd outputs.
// Stage 3: stp1_5/stp1_6 recomputed as cospi_16 rotations of
//          stp2_6/stp2_5 (stg2_0/stg2_1 reused as the stage-3 pair).
// Stage 4: final saturating adds/subs into out0..out7.
// Requires tmp0..tmp7, `rounding`, stg1_0..3 and stg2_0..3 declared at
// the expansion site.
381 #define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
382 out4, out5, out6, out7) \
386 const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
387 const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
388 const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
389 const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
391 MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
392 stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
397 const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
398 const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
399 const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
400 const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
402 MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
403 stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
405 stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
406 stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
407 stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
408 stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
413 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
414 const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
416 stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
417 stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
418 stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
419 stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
421 tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
422 tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
423 tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
424 tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
426 tmp0 = _mm_add_epi32(tmp0, rounding); \
427 tmp1 = _mm_add_epi32(tmp1, rounding); \
428 tmp2 = _mm_add_epi32(tmp2, rounding); \
429 tmp3 = _mm_add_epi32(tmp3, rounding); \
431 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
432 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
433 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
434 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
436 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
437 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
441 out0 = _mm_adds_epi16(stp1_0, stp2_7); \
442 out1 = _mm_adds_epi16(stp1_1, stp1_6); \
443 out2 = _mm_adds_epi16(stp1_2, stp1_5); \
444 out3 = _mm_adds_epi16(stp1_3, stp2_4); \
445 out4 = _mm_subs_epi16(stp1_3, stp2_4); \
446 out5 = _mm_subs_epi16(stp1_2, stp1_5); \
447 out6 = _mm_subs_epi16(stp1_1, stp1_6); \
448 out7 = _mm_subs_epi16(stp1_0, stp2_7); \
// Full 8x8 inverse DCT (all 64 coefficients) plus reconstruction into
// `dest`.  The 2-D transform runs the same 1-D pass twice: the
// TRANSPOSE_8X8 at the top of the loop flips the data, so iteration 0
// transforms rows and iteration 1 columns.  Final scaling is
// (x + 16) >> 5, then RECON_AND_STORE adds to dest with saturation.
// NOTE(review): the declaration of loop counter `i`, the signature
// tail, and several continuation/brace lines are not visible in this
// chunk of the file.
451 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
453 const __m128i zero = _mm_setzero_si128();
454 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
455 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// Stage-1 (odd) and stage-2 (even) rotation constants for IDCT8.
456 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
457 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
458 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
459 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
460 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
461 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
462 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
463 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
465 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
466 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
467 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
468 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// Load all eight rows of coefficients.
472 in0 = load_input_data(input);
473 in1 = load_input_data(input + 8 * 1);
474 in2 = load_input_data(input + 8 * 2);
475 in3 = load_input_data(input + 8 * 3);
476 in4 = load_input_data(input + 8 * 4);
477 in5 = load_input_data(input + 8 * 5);
478 in6 = load_input_data(input + 8 * 6);
479 in7 = load_input_data(input + 8 * 7);
// Two passes: rows (i == 0) then columns (i == 1).
482 for (i = 0; i < 2; i++) {
483 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
484 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
487 // 4-stage 1D idct8x8
488 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
492 // Final rounding and shift
493 in0 = _mm_adds_epi16(in0, final_rounding);
494 in1 = _mm_adds_epi16(in1, final_rounding);
495 in2 = _mm_adds_epi16(in2, final_rounding);
496 in3 = _mm_adds_epi16(in3, final_rounding);
497 in4 = _mm_adds_epi16(in4, final_rounding);
498 in5 = _mm_adds_epi16(in5, final_rounding);
499 in6 = _mm_adds_epi16(in6, final_rounding);
500 in7 = _mm_adds_epi16(in7, final_rounding);
502 in0 = _mm_srai_epi16(in0, 5);
503 in1 = _mm_srai_epi16(in1, 5);
504 in2 = _mm_srai_epi16(in2, 5);
505 in3 = _mm_srai_epi16(in3, 5);
506 in4 = _mm_srai_epi16(in4, 5);
507 in5 = _mm_srai_epi16(in5, 5);
508 in6 = _mm_srai_epi16(in6, 5);
509 in7 = _mm_srai_epi16(in7, 5);
// Add the residual rows to the destination block with saturation.
511 RECON_AND_STORE(dest + 0 * stride, in0);
512 RECON_AND_STORE(dest + 1 * stride, in1);
513 RECON_AND_STORE(dest + 2 * stride, in2);
514 RECON_AND_STORE(dest + 3 * stride, in3);
515 RECON_AND_STORE(dest + 4 * stride, in4);
516 RECON_AND_STORE(dest + 5 * stride, in5);
517 RECON_AND_STORE(dest + 6 * stride, in6);
518 RECON_AND_STORE(dest + 7 * stride, in7);
// DC-only 8x8 idct: input[0] scaled twice by cospi_16_64 (both 1-D
// passes collapse to scalar multiplies), rounded by
// ROUND_POWER_OF_TWO(., 5), splatted, and added to all 64 destination
// pixels via RECON_AND_STORE.
// NOTE(review): the declarations of `a` and `dc_value` and the
// signature tail are not visible in this chunk.
521 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
524 const __m128i zero = _mm_setzero_si128();
527 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
528 a = (int)dct_const_round_shift(a * cospi_16_64);
529 a = ROUND_POWER_OF_TWO(a, 5);
531 dc_value = _mm_set1_epi16(a);
533 RECON_AND_STORE(dest + 0 * stride, dc_value);
534 RECON_AND_STORE(dest + 1 * stride, dc_value);
535 RECON_AND_STORE(dest + 2 * stride, dc_value);
536 RECON_AND_STORE(dest + 3 * stride, dc_value);
537 RECON_AND_STORE(dest + 4 * stride, dc_value);
538 RECON_AND_STORE(dest + 5 * stride, dc_value);
539 RECON_AND_STORE(dest + 6 * stride, dc_value);
540 RECON_AND_STORE(dest + 7 * stride, dc_value);
// One complete in-place 1-D 8-point idct pass on in[0..7]: transpose the
// 8x8 block, then run IDCT8 writing the results back into in[].  Used by
// the hybrid (iht) transform wrappers that mix idct and iadst passes.
543 void idct8_sse2(__m128i *in) {
544 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
545 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
546 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
547 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
548 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
549 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
550 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
551 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
552 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
554 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
555 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
556 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
557 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
559 // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
560 TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
561 in1, in2, in3, in4, in5, in6, in7);
563 // 4-stage 1D idct8x8
564 IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
565 in[4], in[5], in[6], in[7]);
// In-place 1-D 8-point inverse ADST on in[0..7] (eight 16-bit rows).
// Stage 1 rotates input pairs by the cospi_{2,10,18,26}/{30,22,14,6}
// constants; stage 2 uses the cospi_8/24 pairs; stage 3 uses the
// cospi_16 butterflies; odd-index outputs are negated at the end (the
// iadst output sign pattern).
// NOTE(review): the lines loading in0..in7 from in[] after the
// transpose, and the stores of the even outputs in[0]/in[2]/in[4]/in[6]
// and of s0/s1..., are not visible in this chunk of the file — confirm
// against the full source.
568 void iadst8_sse2(__m128i *in) {
569 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
570 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
571 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
572 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
573 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
574 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
575 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
576 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
577 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
578 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
579 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
580 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
581 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
582 const __m128i k__const_0 = _mm_set1_epi16(0);
583 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
585 __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
586 __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
587 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
588 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
589 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
// Transpose so the 1-D transform runs along registers.
592 array_transpose_8x8(in, in);
594 // properly aligned for butterfly input
604 // column transformation
606 // interleave and multiply/add into 32-bit integer
607 s0 = _mm_unpacklo_epi16(in0, in1);
608 s1 = _mm_unpackhi_epi16(in0, in1);
609 s2 = _mm_unpacklo_epi16(in2, in3);
610 s3 = _mm_unpackhi_epi16(in2, in3);
611 s4 = _mm_unpacklo_epi16(in4, in5);
612 s5 = _mm_unpackhi_epi16(in4, in5);
613 s6 = _mm_unpacklo_epi16(in6, in7);
614 s7 = _mm_unpackhi_epi16(in6, in7);
// Stage 1 rotations.
616 u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
617 u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
618 u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
619 u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
620 u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
621 u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
622 u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
623 u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
624 u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
625 u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
626 u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
627 u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
628 u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
629 u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
630 u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
631 u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
// Stage-1 butterflies in 32-bit precision.
634 w0 = _mm_add_epi32(u0, u8);
635 w1 = _mm_add_epi32(u1, u9);
636 w2 = _mm_add_epi32(u2, u10);
637 w3 = _mm_add_epi32(u3, u11);
638 w4 = _mm_add_epi32(u4, u12);
639 w5 = _mm_add_epi32(u5, u13);
640 w6 = _mm_add_epi32(u6, u14);
641 w7 = _mm_add_epi32(u7, u15);
642 w8 = _mm_sub_epi32(u0, u8);
643 w9 = _mm_sub_epi32(u1, u9);
644 w10 = _mm_sub_epi32(u2, u10);
645 w11 = _mm_sub_epi32(u3, u11);
646 w12 = _mm_sub_epi32(u4, u12);
647 w13 = _mm_sub_epi32(u5, u13);
648 w14 = _mm_sub_epi32(u6, u14);
649 w15 = _mm_sub_epi32(u7, u15);
651 // shift and rounding
652 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
653 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
654 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
655 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
656 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
657 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
658 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
659 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
660 v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
661 v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
662 v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
663 v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
664 v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
665 v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
666 v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
667 v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
669 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
670 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
671 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
672 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
673 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
674 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
675 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
676 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
677 u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
678 u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
679 u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
680 u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
681 u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
682 u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
683 u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
684 u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
686 // back to 16-bit and pack 8 integers into __m128i
687 in[0] = _mm_packs_epi32(u0, u1);
688 in[1] = _mm_packs_epi32(u2, u3);
689 in[2] = _mm_packs_epi32(u4, u5);
690 in[3] = _mm_packs_epi32(u6, u7);
691 in[4] = _mm_packs_epi32(u8, u9);
692 in[5] = _mm_packs_epi32(u10, u11);
693 in[6] = _mm_packs_epi32(u12, u13);
694 in[7] = _mm_packs_epi32(u14, u15);
// Stage 2: even half is plain adds/subs; odd half (in[4..7]) goes
// through the cospi_8/24 rotations.
697 s0 = _mm_add_epi16(in[0], in[2]);
698 s1 = _mm_add_epi16(in[1], in[3]);
699 s2 = _mm_sub_epi16(in[0], in[2]);
700 s3 = _mm_sub_epi16(in[1], in[3]);
701 u0 = _mm_unpacklo_epi16(in[4], in[5]);
702 u1 = _mm_unpackhi_epi16(in[4], in[5]);
703 u2 = _mm_unpacklo_epi16(in[6], in[7]);
704 u3 = _mm_unpackhi_epi16(in[6], in[7]);
706 v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
707 v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
708 v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
709 v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
710 v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
711 v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
712 v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
713 v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
715 w0 = _mm_add_epi32(v0, v4);
716 w1 = _mm_add_epi32(v1, v5);
717 w2 = _mm_add_epi32(v2, v6);
718 w3 = _mm_add_epi32(v3, v7);
719 w4 = _mm_sub_epi32(v0, v4);
720 w5 = _mm_sub_epi32(v1, v5);
721 w6 = _mm_sub_epi32(v2, v6);
722 w7 = _mm_sub_epi32(v3, v7);
724 v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
725 v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
726 v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
727 v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
728 v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
729 v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
730 v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
731 v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
733 u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
734 u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
735 u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
736 u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
737 u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
738 u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
739 u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
740 u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
742 // back to 16-bit intergers
743 s4 = _mm_packs_epi32(u0, u1);
744 s5 = _mm_packs_epi32(u2, u3);
745 s6 = _mm_packs_epi32(u4, u5);
746 s7 = _mm_packs_epi32(u6, u7);
// Stage 3: cospi_16 butterflies on s2/s3 and s6/s7.
749 u0 = _mm_unpacklo_epi16(s2, s3);
750 u1 = _mm_unpackhi_epi16(s2, s3);
751 u2 = _mm_unpacklo_epi16(s6, s7);
752 u3 = _mm_unpackhi_epi16(s6, s7);
754 v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
755 v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
756 v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
757 v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
758 v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
759 v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
760 v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
761 v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
763 u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
764 u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
765 u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
766 u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
767 u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
768 u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
769 u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
770 u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
772 v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
773 v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
774 v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
775 v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
776 v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
777 v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
778 v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
779 v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
781 s2 = _mm_packs_epi32(v0, v1);
782 s3 = _mm_packs_epi32(v2, v3);
783 s6 = _mm_packs_epi32(v4, v5);
784 s7 = _mm_packs_epi32(v6, v7);
// Negate the odd-index outputs (iadst output sign pattern).
787 in[1] = _mm_sub_epi16(k__const_0, s4);
789 in[3] = _mm_sub_epi16(k__const_0, s2);
791 in[5] = _mm_sub_epi16(k__const_0, s7);
793 in[7] = _mm_sub_epi16(k__const_0, s1);
796 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
798 const __m128i zero = _mm_setzero_si128();
799 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
800 const __m128i final_rounding = _mm_set1_epi16(1 << 4);
801 const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
802 const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
803 const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
804 const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
805 const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
806 const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
807 const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
808 const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
809 const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
811 __m128i in0, in1, in2, in3, in4, in5, in6, in7;
812 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
813 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
814 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
816 // Rows. Load 4-row input data.
817 in0 = load_input_data(input);
818 in1 = load_input_data(input + 8 * 1);
819 in2 = load_input_data(input + 8 * 2);
820 in3 = load_input_data(input + 8 * 3);
823 TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
826 const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
827 const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
829 tmp0 = _mm_madd_epi16(lo_17, stg1_0);
830 tmp2 = _mm_madd_epi16(lo_17, stg1_1);
831 tmp4 = _mm_madd_epi16(lo_35, stg1_2);
832 tmp6 = _mm_madd_epi16(lo_35, stg1_3);
834 tmp0 = _mm_add_epi32(tmp0, rounding);
835 tmp2 = _mm_add_epi32(tmp2, rounding);
836 tmp4 = _mm_add_epi32(tmp4, rounding);
837 tmp6 = _mm_add_epi32(tmp6, rounding);
838 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
839 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
840 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
841 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
843 stp1_4 = _mm_packs_epi32(tmp0, tmp2);
844 stp1_5 = _mm_packs_epi32(tmp4, tmp6);
849 const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
850 const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
852 tmp0 = _mm_madd_epi16(lo_04, stg2_0);
853 tmp2 = _mm_madd_epi16(lo_04, stg2_1);
854 tmp4 = _mm_madd_epi16(lo_26, stg2_2);
855 tmp6 = _mm_madd_epi16(lo_26, stg2_3);
857 tmp0 = _mm_add_epi32(tmp0, rounding);
858 tmp2 = _mm_add_epi32(tmp2, rounding);
859 tmp4 = _mm_add_epi32(tmp4, rounding);
860 tmp6 = _mm_add_epi32(tmp6, rounding);
861 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
862 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
863 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
864 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
866 stp2_0 = _mm_packs_epi32(tmp0, tmp2);
867 stp2_2 = _mm_packs_epi32(tmp6, tmp4);
869 tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
870 tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
873 stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
874 stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
879 const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
881 tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
882 tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
884 stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
885 stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
887 tmp0 = _mm_madd_epi16(lo_56, stg3_0);
888 tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
890 tmp0 = _mm_add_epi32(tmp0, rounding);
891 tmp2 = _mm_add_epi32(tmp2, rounding);
892 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
893 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
895 stp1_5 = _mm_packs_epi32(tmp0, tmp2);
899 tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
900 tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
901 tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
902 tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
904 TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
906 IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
908 // Final rounding and shift
909 in0 = _mm_adds_epi16(in0, final_rounding);
910 in1 = _mm_adds_epi16(in1, final_rounding);
911 in2 = _mm_adds_epi16(in2, final_rounding);
912 in3 = _mm_adds_epi16(in3, final_rounding);
913 in4 = _mm_adds_epi16(in4, final_rounding);
914 in5 = _mm_adds_epi16(in5, final_rounding);
915 in6 = _mm_adds_epi16(in6, final_rounding);
916 in7 = _mm_adds_epi16(in7, final_rounding);
918 in0 = _mm_srai_epi16(in0, 5);
919 in1 = _mm_srai_epi16(in1, 5);
920 in2 = _mm_srai_epi16(in2, 5);
921 in3 = _mm_srai_epi16(in3, 5);
922 in4 = _mm_srai_epi16(in4, 5);
923 in5 = _mm_srai_epi16(in5, 5);
924 in6 = _mm_srai_epi16(in6, 5);
925 in7 = _mm_srai_epi16(in7, 5);
927 RECON_AND_STORE(dest + 0 * stride, in0);
928 RECON_AND_STORE(dest + 1 * stride, in1);
929 RECON_AND_STORE(dest + 2 * stride, in2);
930 RECON_AND_STORE(dest + 3 * stride, in3);
931 RECON_AND_STORE(dest + 4 * stride, in4);
932 RECON_AND_STORE(dest + 5 * stride, in5);
933 RECON_AND_STORE(dest + 6 * stride, in6);
934 RECON_AND_STORE(dest + 7 * stride, in7);
940 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
941 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
942 const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
943 const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
944 const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
945 const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
946 const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
947 const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
949 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
950 stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
952 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
953 stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
958 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
959 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
960 const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
961 const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
963 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
964 stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
966 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
967 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
968 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
969 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
971 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
972 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
973 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
974 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
979 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
980 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
981 const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
982 const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
984 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
985 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
986 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
987 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
989 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
990 stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
992 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
993 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
994 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
995 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
997 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
998 stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
1004 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1005 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1007 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
1008 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
1009 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
1010 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
1012 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1013 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1014 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1015 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1017 tmp0 = _mm_add_epi32(tmp0, rounding); \
1018 tmp1 = _mm_add_epi32(tmp1, rounding); \
1019 tmp2 = _mm_add_epi32(tmp2, rounding); \
1020 tmp3 = _mm_add_epi32(tmp3, rounding); \
1022 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1023 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1024 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1025 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1027 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1028 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1030 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
1031 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
1032 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
1033 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1035 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1036 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
1037 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
1038 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1043 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1044 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1045 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1046 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1048 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1049 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1050 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1051 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1052 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1053 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1054 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1055 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1057 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
1058 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
1065 const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
1066 const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
1067 const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
1068 const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
1070 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
1071 stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
1077 const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
1078 const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
1080 MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
1082 stp1_9 = stp1_8_0; \
1083 stp1_10 = stp1_11; \
1085 stp1_13 = stp1_12_0; \
1086 stp1_14 = stp1_15; \
1091 const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
1092 const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
1094 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
1095 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
1096 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1097 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1099 MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
1103 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
1104 stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
1110 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
1111 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
1116 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
1117 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
1118 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
1119 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
1121 tmp0 = _mm_add_epi32(tmp0, rounding); \
1122 tmp1 = _mm_add_epi32(tmp1, rounding); \
1123 tmp2 = _mm_add_epi32(tmp2, rounding); \
1124 tmp3 = _mm_add_epi32(tmp3, rounding); \
1126 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
1127 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
1128 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
1129 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
1131 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
1132 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
1134 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
1135 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
1136 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
1137 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
1139 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
1140 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
1141 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
1142 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
1147 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
1148 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
1149 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
1150 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
1152 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
1153 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
1154 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
1155 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
1156 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
1157 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
1158 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
1159 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
1161 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
1162 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
1166 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
1168 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
1169 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
1170 const __m128i zero = _mm_setzero_si128();
1172 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
1173 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
1174 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
1175 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
1176 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
1177 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
1178 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
1179 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
1181 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1182 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
1183 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1184 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
1186 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
1187 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1188 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1189 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
1190 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1191 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
1192 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
1193 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
1195 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1197 __m128i in[16], l[16], r[16], *curr1;
1198 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
1199 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
1200 stp1_8_0, stp1_12_0;
1201 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
1202 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
1203 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1207 for (i = 0; i < 2; i++) {
1211 in[0] = load_input_data(input);
1212 in[8] = load_input_data(input + 8 * 1);
1213 in[1] = load_input_data(input + 8 * 2);
1214 in[9] = load_input_data(input + 8 * 3);
1215 in[2] = load_input_data(input + 8 * 4);
1216 in[10] = load_input_data(input + 8 * 5);
1217 in[3] = load_input_data(input + 8 * 6);
1218 in[11] = load_input_data(input + 8 * 7);
1219 in[4] = load_input_data(input + 8 * 8);
1220 in[12] = load_input_data(input + 8 * 9);
1221 in[5] = load_input_data(input + 8 * 10);
1222 in[13] = load_input_data(input + 8 * 11);
1223 in[6] = load_input_data(input + 8 * 12);
1224 in[14] = load_input_data(input + 8 * 13);
1225 in[7] = load_input_data(input + 8 * 14);
1226 in[15] = load_input_data(input + 8 * 15);
1228 array_transpose_8x8(in, in);
1229 array_transpose_8x8(in + 8, in + 8);
1234 curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
1235 curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
1236 curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
1237 curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
1238 curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
1239 curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
1240 curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
1241 curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
1242 curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
1243 curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
1244 curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
1245 curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
1246 curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
1247 curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
1248 curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
1249 curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
1254 for (i = 0; i < 2; i++) {
1257 array_transpose_8x8(l + i * 8, in);
1258 array_transpose_8x8(r + i * 8, in + 8);
1263 in[0] = _mm_add_epi16(stp2_0, stp1_15);
1264 in[1] = _mm_add_epi16(stp2_1, stp1_14);
1265 in[2] = _mm_add_epi16(stp2_2, stp2_13);
1266 in[3] = _mm_add_epi16(stp2_3, stp2_12);
1267 in[4] = _mm_add_epi16(stp2_4, stp2_11);
1268 in[5] = _mm_add_epi16(stp2_5, stp2_10);
1269 in[6] = _mm_add_epi16(stp2_6, stp1_9);
1270 in[7] = _mm_add_epi16(stp2_7, stp1_8);
1271 in[8] = _mm_sub_epi16(stp2_7, stp1_8);
1272 in[9] = _mm_sub_epi16(stp2_6, stp1_9);
1273 in[10] = _mm_sub_epi16(stp2_5, stp2_10);
1274 in[11] = _mm_sub_epi16(stp2_4, stp2_11);
1275 in[12] = _mm_sub_epi16(stp2_3, stp2_12);
1276 in[13] = _mm_sub_epi16(stp2_2, stp2_13);
1277 in[14] = _mm_sub_epi16(stp2_1, stp1_14);
1278 in[15] = _mm_sub_epi16(stp2_0, stp1_15);
1280 for (j = 0; j < 16; ++j) {
1281 // Final rounding and shift
1282 in[j] = _mm_adds_epi16(in[j], final_rounding);
1283 in[j] = _mm_srai_epi16(in[j], 6);
1284 RECON_AND_STORE(dest + j * stride, in[j]);
1291 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
1294 const __m128i zero = _mm_setzero_si128();
1297 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
1298 a = (int)dct_const_round_shift(a * cospi_16_64);
1299 a = ROUND_POWER_OF_TWO(a, 6);
1301 dc_value = _mm_set1_epi16(a);
1303 for (i = 0; i < 16; ++i) {
1304 RECON_AND_STORE(dest + 0, dc_value);
1305 RECON_AND_STORE(dest + 8, dc_value);
1310 static void iadst16_8col(__m128i *in) {
1311 // perform 16x16 1-D ADST for 8 columns
1312 __m128i s[16], x[16], u[32], v[32];
1313 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1314 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1315 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1316 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1317 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1318 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1319 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1320 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1321 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1322 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1323 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1324 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1325 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1326 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1327 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1328 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1329 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1330 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1331 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1332 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1333 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1334 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1335 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1336 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1337 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1338 const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
1339 const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
1340 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1341 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1342 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1343 const __m128i kZero = _mm_set1_epi16(0);
1345 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1346 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1347 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1348 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1349 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1350 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1351 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1352 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1353 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1354 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1355 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1356 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1357 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1358 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1359 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1360 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1362 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1363 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1364 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1365 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1366 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1367 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1368 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1369 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1370 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1371 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1372 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1373 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1374 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1375 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1376 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1377 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1378 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1379 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1380 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1381 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1382 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1383 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1384 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1385 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1386 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1387 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1388 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1389 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1390 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1391 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1392 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1393 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1395 u[0] = _mm_add_epi32(v[0], v[16]);
1396 u[1] = _mm_add_epi32(v[1], v[17]);
1397 u[2] = _mm_add_epi32(v[2], v[18]);
1398 u[3] = _mm_add_epi32(v[3], v[19]);
1399 u[4] = _mm_add_epi32(v[4], v[20]);
1400 u[5] = _mm_add_epi32(v[5], v[21]);
1401 u[6] = _mm_add_epi32(v[6], v[22]);
1402 u[7] = _mm_add_epi32(v[7], v[23]);
1403 u[8] = _mm_add_epi32(v[8], v[24]);
1404 u[9] = _mm_add_epi32(v[9], v[25]);
1405 u[10] = _mm_add_epi32(v[10], v[26]);
1406 u[11] = _mm_add_epi32(v[11], v[27]);
1407 u[12] = _mm_add_epi32(v[12], v[28]);
1408 u[13] = _mm_add_epi32(v[13], v[29]);
1409 u[14] = _mm_add_epi32(v[14], v[30]);
1410 u[15] = _mm_add_epi32(v[15], v[31]);
1411 u[16] = _mm_sub_epi32(v[0], v[16]);
1412 u[17] = _mm_sub_epi32(v[1], v[17]);
1413 u[18] = _mm_sub_epi32(v[2], v[18]);
1414 u[19] = _mm_sub_epi32(v[3], v[19]);
1415 u[20] = _mm_sub_epi32(v[4], v[20]);
1416 u[21] = _mm_sub_epi32(v[5], v[21]);
1417 u[22] = _mm_sub_epi32(v[6], v[22]);
1418 u[23] = _mm_sub_epi32(v[7], v[23]);
1419 u[24] = _mm_sub_epi32(v[8], v[24]);
1420 u[25] = _mm_sub_epi32(v[9], v[25]);
1421 u[26] = _mm_sub_epi32(v[10], v[26]);
1422 u[27] = _mm_sub_epi32(v[11], v[27]);
1423 u[28] = _mm_sub_epi32(v[12], v[28]);
1424 u[29] = _mm_sub_epi32(v[13], v[29]);
1425 u[30] = _mm_sub_epi32(v[14], v[30]);
1426 u[31] = _mm_sub_epi32(v[15], v[31]);
1428 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1429 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1430 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1431 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1432 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1433 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1434 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1435 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1436 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1437 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1438 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1439 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1440 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1441 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1442 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1443 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1444 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1445 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1446 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1447 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1448 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1449 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1450 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1451 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1452 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1453 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1454 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1455 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1456 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1457 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1458 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1459 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1461 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1462 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1463 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1464 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1465 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1466 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1467 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1468 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1469 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1470 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1471 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1472 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1473 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1474 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1475 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1476 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1477 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1478 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1479 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1480 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1481 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1482 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1483 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1484 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1485 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1486 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1487 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1488 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1489 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1490 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1491 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1492 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1494 s[0] = _mm_packs_epi32(u[0], u[1]);
1495 s[1] = _mm_packs_epi32(u[2], u[3]);
1496 s[2] = _mm_packs_epi32(u[4], u[5]);
1497 s[3] = _mm_packs_epi32(u[6], u[7]);
1498 s[4] = _mm_packs_epi32(u[8], u[9]);
1499 s[5] = _mm_packs_epi32(u[10], u[11]);
1500 s[6] = _mm_packs_epi32(u[12], u[13]);
1501 s[7] = _mm_packs_epi32(u[14], u[15]);
1502 s[8] = _mm_packs_epi32(u[16], u[17]);
1503 s[9] = _mm_packs_epi32(u[18], u[19]);
1504 s[10] = _mm_packs_epi32(u[20], u[21]);
1505 s[11] = _mm_packs_epi32(u[22], u[23]);
1506 s[12] = _mm_packs_epi32(u[24], u[25]);
1507 s[13] = _mm_packs_epi32(u[26], u[27]);
1508 s[14] = _mm_packs_epi32(u[28], u[29]);
1509 s[15] = _mm_packs_epi32(u[30], u[31]);
1512 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1513 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1514 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1515 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1516 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1517 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1518 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1519 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1521 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1522 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1523 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1524 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1525 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1526 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1527 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1528 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1529 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1530 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1531 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1532 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1533 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1534 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1535 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1536 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1538 u[0] = _mm_add_epi32(v[0], v[8]);
1539 u[1] = _mm_add_epi32(v[1], v[9]);
1540 u[2] = _mm_add_epi32(v[2], v[10]);
1541 u[3] = _mm_add_epi32(v[3], v[11]);
1542 u[4] = _mm_add_epi32(v[4], v[12]);
1543 u[5] = _mm_add_epi32(v[5], v[13]);
1544 u[6] = _mm_add_epi32(v[6], v[14]);
1545 u[7] = _mm_add_epi32(v[7], v[15]);
1546 u[8] = _mm_sub_epi32(v[0], v[8]);
1547 u[9] = _mm_sub_epi32(v[1], v[9]);
1548 u[10] = _mm_sub_epi32(v[2], v[10]);
1549 u[11] = _mm_sub_epi32(v[3], v[11]);
1550 u[12] = _mm_sub_epi32(v[4], v[12]);
1551 u[13] = _mm_sub_epi32(v[5], v[13]);
1552 u[14] = _mm_sub_epi32(v[6], v[14]);
1553 u[15] = _mm_sub_epi32(v[7], v[15]);
1555 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1556 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1557 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1558 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1559 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1560 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1561 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1562 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1563 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1564 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1565 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1566 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1567 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1568 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1569 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1570 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1572 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1573 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1574 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1575 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1576 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1577 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1578 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1579 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1580 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1581 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1582 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1583 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1584 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1585 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1586 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1587 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1589 x[0] = _mm_add_epi16(s[0], s[4]);
1590 x[1] = _mm_add_epi16(s[1], s[5]);
1591 x[2] = _mm_add_epi16(s[2], s[6]);
1592 x[3] = _mm_add_epi16(s[3], s[7]);
1593 x[4] = _mm_sub_epi16(s[0], s[4]);
1594 x[5] = _mm_sub_epi16(s[1], s[5]);
1595 x[6] = _mm_sub_epi16(s[2], s[6]);
1596 x[7] = _mm_sub_epi16(s[3], s[7]);
1597 x[8] = _mm_packs_epi32(u[0], u[1]);
1598 x[9] = _mm_packs_epi32(u[2], u[3]);
1599 x[10] = _mm_packs_epi32(u[4], u[5]);
1600 x[11] = _mm_packs_epi32(u[6], u[7]);
1601 x[12] = _mm_packs_epi32(u[8], u[9]);
1602 x[13] = _mm_packs_epi32(u[10], u[11]);
1603 x[14] = _mm_packs_epi32(u[12], u[13]);
1604 x[15] = _mm_packs_epi32(u[14], u[15]);
1607 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1608 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1609 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1610 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1611 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1612 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1613 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1614 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1616 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1617 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1618 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1619 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1620 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1621 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1622 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1623 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1624 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1625 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1626 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1627 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1628 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1629 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1630 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1631 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1633 u[0] = _mm_add_epi32(v[0], v[4]);
1634 u[1] = _mm_add_epi32(v[1], v[5]);
1635 u[2] = _mm_add_epi32(v[2], v[6]);
1636 u[3] = _mm_add_epi32(v[3], v[7]);
1637 u[4] = _mm_sub_epi32(v[0], v[4]);
1638 u[5] = _mm_sub_epi32(v[1], v[5]);
1639 u[6] = _mm_sub_epi32(v[2], v[6]);
1640 u[7] = _mm_sub_epi32(v[3], v[7]);
1641 u[8] = _mm_add_epi32(v[8], v[12]);
1642 u[9] = _mm_add_epi32(v[9], v[13]);
1643 u[10] = _mm_add_epi32(v[10], v[14]);
1644 u[11] = _mm_add_epi32(v[11], v[15]);
1645 u[12] = _mm_sub_epi32(v[8], v[12]);
1646 u[13] = _mm_sub_epi32(v[9], v[13]);
1647 u[14] = _mm_sub_epi32(v[10], v[14]);
1648 u[15] = _mm_sub_epi32(v[11], v[15]);
1650 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1651 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1652 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1653 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1654 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1655 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1656 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1657 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1658 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1659 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1660 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1661 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1662 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1663 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1664 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1665 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1667 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1668 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1669 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1670 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1671 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1672 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1673 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1674 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1675 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1676 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1677 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1678 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1679 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1680 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1681 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1682 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1684 s[0] = _mm_add_epi16(x[0], x[2]);
1685 s[1] = _mm_add_epi16(x[1], x[3]);
1686 s[2] = _mm_sub_epi16(x[0], x[2]);
1687 s[3] = _mm_sub_epi16(x[1], x[3]);
1688 s[4] = _mm_packs_epi32(v[0], v[1]);
1689 s[5] = _mm_packs_epi32(v[2], v[3]);
1690 s[6] = _mm_packs_epi32(v[4], v[5]);
1691 s[7] = _mm_packs_epi32(v[6], v[7]);
1692 s[8] = _mm_add_epi16(x[8], x[10]);
1693 s[9] = _mm_add_epi16(x[9], x[11]);
1694 s[10] = _mm_sub_epi16(x[8], x[10]);
1695 s[11] = _mm_sub_epi16(x[9], x[11]);
1696 s[12] = _mm_packs_epi32(v[8], v[9]);
1697 s[13] = _mm_packs_epi32(v[10], v[11]);
1698 s[14] = _mm_packs_epi32(v[12], v[13]);
1699 s[15] = _mm_packs_epi32(v[14], v[15]);
1702 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1703 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1704 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1705 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1706 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1707 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1708 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1709 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1711 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1712 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1713 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1714 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1715 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1716 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1717 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1718 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1719 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1720 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1721 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1722 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1723 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1724 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1725 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1726 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1728 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1729 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1730 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1731 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1732 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1733 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1734 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1735 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1736 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1737 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1738 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1739 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1740 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1741 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1742 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1743 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1745 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1746 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1747 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1748 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1749 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1750 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1751 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1752 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1753 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1754 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1755 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1756 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1757 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1758 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1759 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1760 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1763 in[1] = _mm_sub_epi16(kZero, s[8]);
1765 in[3] = _mm_sub_epi16(kZero, s[4]);
1766 in[4] = _mm_packs_epi32(v[4], v[5]);
1767 in[5] = _mm_packs_epi32(v[12], v[13]);
1768 in[6] = _mm_packs_epi32(v[8], v[9]);
1769 in[7] = _mm_packs_epi32(v[0], v[1]);
1770 in[8] = _mm_packs_epi32(v[2], v[3]);
1771 in[9] = _mm_packs_epi32(v[10], v[11]);
1772 in[10] = _mm_packs_epi32(v[14], v[15]);
1773 in[11] = _mm_packs_epi32(v[6], v[7]);
1775 in[13] = _mm_sub_epi16(kZero, s[13]);
1777 in[15] = _mm_sub_epi16(kZero, s[1]);
// 16-point inverse DCT applied to 8 columns of 16-bit coefficients
// (in[0..15], one __m128i row of 8 lanes per coefficient row), in place.
// Each butterfly rotation is computed as a 16x16->32 multiply-accumulate
// (_mm_madd_epi16 gives a*x + b*y per 32-bit lane), rounded with
// DCT_CONST_ROUNDING, shifted down by DCT_CONST_BITS, then packed back to
// saturated 16 bits with _mm_packs_epi32.
//
// NOTE(review): this copy appears to be missing several statement runs
// present in upstream libvpx -- the initial stage that fills s[8..15]
// (and later t[0..3]) before their first use below, a few s[]/t[]
// pass-through assignments between stages, and the closing brace.
// Confirm against upstream vpx_dsp/x86/inv_txfm_sse2.c before editing.
static void idct16_8col(__m128i *in) {
  // Rotation constants: pair_set_epi16(a, b) builds {a,b,a,b,...} so that
  // madd against interleaved {x,y} pairs computes a*x + b*y per lane.
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // u/v hold 32-bit madd intermediates; s/t hold 16-bit stage outputs.
  __m128i v[16], u[16], s[16], t[16];

  // Odd-half rotations (final outputs 8..15): interleave butterfly
  // partners so each madd computes one rotated component.
  // NOTE(review): s[8..15] are read here but the stage producing them is
  // not visible in this copy (presumably loaded from in[] upstream).
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  // Round and shift the 32-bit products back to the 16-bit domain.
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  // Pack back to int16 in butterfly-partner order (8/15, 9/14, ...).
  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // Rotations for outputs 4..7 (cospi 28/4 and 12/20 pairs).
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  // t[4..7] are the rotated even-half values; t[8..15] are plain
  // add/sub butterflies of the odd-half values computed above.
  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // NOTE(review): t[0..3] are read here but their producing statements
  // are not visible in this copy -- verify against upstream.
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);

  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);

  // Even-half butterflies.
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);

  // (s6 - s5, s6 + s5) * cospi_16_64 rotation producing t[5], t[6].
  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);

  // Half-pixel rotations on (t10, t13) and (t11, t12) by cospi_16_64.
  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);

  // Final butterfly: mirror-add/subtract even and odd halves into in[].
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
// One 16-point inverse DCT pass over a 16x16 block held as two 8x16
// halves (in0 = left 8 columns, in1 = right 8 columns): transpose first
// so the row transform can run column-wise on packed int16 lanes.
// NOTE(review): the per-half transform calls (idct16_8col on in0 and
// in1) and the closing brace are not visible in this copy -- confirm
// against upstream before relying on this function.
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
// One 16-point inverse ADST pass over a 16x16 block held as two 8x16
// halves; transposes so the transform operates along the packed lanes.
// NOTE(review): the per-half transform calls (presumably iadst16_8col)
// and the closing brace are not visible in this copy -- confirm against
// upstream before relying on this function.
void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
// Inverse 16x16 transform + reconstruction specialized for the case
// where only the top-left 4x4 quadrant of coefficients is non-zero
// (eob <= 10).  Only 4 coefficient rows are loaded; the butterfly
// network is pruned accordingly, many rotations collapsing to madds
// against zero-interleaved vectors.  Results are added to dest with
// saturation via RECON_AND_STORE.
//
// NOTE(review): this copy is missing lines present upstream (the
// `int stride) {` signature continuation, the tail of the stp1_
// declaration list, loop-counter declarations, scope braces, and the
// second-pass stage macros inside the i-loop).  Treat the code below as
// a partial transcript and verify against upstream libvpx.
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // 1 << 5: round-to-nearest bias for the final >> 6 (two passes of
  // sqrt(2) scaling plus the 4-bit transform gain).
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // Stage-2 rotation constants (cospi 30/2 and 6/26 pairs).
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  // Stage-3 rotation constants (cospi 28/4 pair).
  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  // Stage-4 constants (cospi 16 half-rotations and 8/24 pairs).
  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  // l[] keeps the first-pass output (left 8x16); in[] is the working set.
  __m128i in[16], l[16];
  // NOTE(review): the stp1_ declaration list is truncated here in this
  // copy (upstream continues onto a further line).
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // First 1-D inverse DCT
  // Only rows 0, 2, 4, 6 carry data when eob <= 10; each holds the four
  // non-zero coefficients of two original rows.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage 2: odd coefficients 1/15 and 13/3, with the absent partner
  // supplied as zero via the interleave.
  const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
  const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

  tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
  tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
  tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
  tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

  tmp0 = _mm_add_epi32(tmp0, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp5 = _mm_add_epi32(tmp5, rounding);
  tmp7 = _mm_add_epi32(tmp7, rounding);

  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
  tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

  // Low 4 lanes hold stp2_8/stp2_11; high 4 hold stp2_15/stp2_12.
  stp2_8 = _mm_packs_epi32(tmp0, tmp2);
  stp2_11 = _mm_packs_epi32(tmp5, tmp7);

  // Stage 3: rotation of coefficient 2 (14 is zero).
  const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

  tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
  tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

  tmp0 = _mm_add_epi32(tmp0, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

  // Extract the high-half partners (15/12) packed above.
  stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
  stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);

  stp1_4 = _mm_packs_epi32(tmp0, tmp2);

  // Stage 4: DC rotation (coefficient 0) and the 8/24 rotations on the
  // odd-half values from stages 2-3.
  const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
  const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

  tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
  tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
  tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
  tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
  tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
  tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

  tmp0 = _mm_add_epi32(tmp0, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp1 = _mm_add_epi32(tmp1, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp5 = _mm_add_epi32(tmp5, rounding);
  tmp7 = _mm_add_epi32(tmp7, rounding);

  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
  tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

  stp1_0 = _mm_packs_epi32(tmp0, tmp0);
  stp1_1 = _mm_packs_epi32(tmp2, tmp2);
  stp2_9 = _mm_packs_epi32(tmp1, tmp3);
  stp2_10 = _mm_packs_epi32(tmp5, tmp7);

  stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);

  // Stage5 and Stage6
  tmp0 = _mm_add_epi16(stp2_8, stp2_11);
  tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
  tmp2 = _mm_add_epi16(stp2_9, stp2_10);
  tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

  // Split the packed low/high halves into individual stp1_ values.
  stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
  stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
  stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
  stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

  stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
  stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
  stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
  stp1_15 = _mm_unpackhi_epi64(tmp0, zero);

  // Stage 6: cospi_16 half-rotations on (6,5), (10,13) and (11,12).
  const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
  const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
  const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

  tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
  tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
  tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
  tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
  tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
  tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

  tmp1 = _mm_add_epi32(tmp1, rounding);
  tmp3 = _mm_add_epi32(tmp3, rounding);
  tmp0 = _mm_add_epi32(tmp0, rounding);
  tmp2 = _mm_add_epi32(tmp2, rounding);
  tmp4 = _mm_add_epi32(tmp4, rounding);
  tmp6 = _mm_add_epi32(tmp6, rounding);

  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
  tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
  tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

  stp1_6 = _mm_packs_epi32(tmp3, tmp1);

  stp2_10 = _mm_packs_epi32(tmp0, zero);
  stp2_13 = _mm_packs_epi32(tmp2, zero);
  stp2_11 = _mm_packs_epi32(tmp4, zero);
  stp2_12 = _mm_packs_epi32(tmp6, zero);

  tmp0 = _mm_add_epi16(stp1_0, stp1_4);
  tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
  tmp2 = _mm_add_epi16(stp1_1, stp1_6);
  tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

  stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
  stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
  stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
  stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
  stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
  stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
  stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
  stp2_7 = _mm_unpackhi_epi64(tmp1, zero);

  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    array_transpose_4X8(l + 8 * i, in);

    // NOTE(review): upstream recomputes the stp1_/stp2_ values for this
    // half between the transpose and the final butterfly; those stage
    // macros are not visible in this copy.
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
2373 #define LOAD_DQCOEFF(reg, input) \
2375 reg = load_input_data(input); \
2382 const __m128i zero = _mm_setzero_si128(); \
2383 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
2384 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
2386 const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
2387 const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
2389 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
2390 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
2392 const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
2393 const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
2395 MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
2397 MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
2399 MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
2401 MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
2407 const __m128i zero = _mm_setzero_si128(); \
2408 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
2409 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
2411 const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
2412 const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
2414 MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
2416 MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
2419 stp2_16 = stp1_16; \
2420 stp2_19 = stp1_19; \
2422 stp2_20 = stp1_20; \
2423 stp2_23 = stp1_23; \
2425 stp2_24 = stp1_24; \
2426 stp2_27 = stp1_27; \
2428 stp2_28 = stp1_28; \
2429 stp2_31 = stp1_31; \
2434 const __m128i zero = _mm_setzero_si128(); \
2435 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
2436 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
2438 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
2439 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
2440 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
2441 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
2443 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
2444 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
2445 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
2446 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
2448 MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
2452 stp1_11 = stp2_11; \
2453 stp1_12 = stp2_12; \
2454 stp1_15 = stp2_15; \
2456 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2457 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
2459 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2460 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2463 stp1_16 = stp2_16; \
2464 stp1_31 = stp2_31; \
2465 stp1_19 = stp2_19; \
2466 stp1_20 = stp2_20; \
2467 stp1_23 = stp2_23; \
2468 stp1_24 = stp2_24; \
2469 stp1_27 = stp2_27; \
2470 stp1_28 = stp2_28; \
2475 const __m128i zero = _mm_setzero_si128(); \
2476 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
2477 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
2479 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
2480 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
2481 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
2482 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
2484 MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
2492 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2493 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
2497 stp2_15 = stp1_15; \
2498 stp2_11 = stp1_11; \
2499 stp2_12 = stp1_12; \
2501 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2502 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2503 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2504 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2505 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2506 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2507 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2508 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2510 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2511 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2512 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2513 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2514 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2515 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2516 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2517 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2522 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2523 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2524 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2525 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2527 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2528 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2529 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2530 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2532 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2533 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2540 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2541 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2542 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2543 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2545 tmp0 = _mm_add_epi32(tmp0, rounding); \
2546 tmp1 = _mm_add_epi32(tmp1, rounding); \
2547 tmp2 = _mm_add_epi32(tmp2, rounding); \
2548 tmp3 = _mm_add_epi32(tmp3, rounding); \
2550 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2551 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2552 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2553 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2555 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2556 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2561 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2562 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2563 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2564 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2565 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2566 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2567 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2568 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2570 stp1_16 = stp2_16; \
2571 stp1_17 = stp2_17; \
2573 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2574 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
2576 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2577 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
2580 stp1_22 = stp2_22; \
2581 stp1_23 = stp2_23; \
2582 stp1_24 = stp2_24; \
2583 stp1_25 = stp2_25; \
2584 stp1_30 = stp2_30; \
2585 stp1_31 = stp2_31; \
2590 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2591 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2592 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2593 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2595 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2596 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2597 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2598 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2599 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2600 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2601 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2602 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2606 stp2_14 = stp1_14; \
2607 stp2_15 = stp1_15; \
2609 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
2610 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
2613 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2614 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2615 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2616 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2617 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2618 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2619 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2620 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2622 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2623 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2624 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2625 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2626 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2627 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2628 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2629 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2634 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2635 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2636 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2637 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2639 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2640 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2641 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2642 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2644 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2645 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2646 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2647 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2648 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2649 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2650 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2651 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2652 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2653 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2654 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2655 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2656 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2657 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2658 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2659 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2661 stp1_16 = stp2_16; \
2662 stp1_17 = stp2_17; \
2663 stp1_18 = stp2_18; \
2664 stp1_19 = stp2_19; \
2666 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2667 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
2669 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
2670 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
2673 stp1_28 = stp2_28; \
2674 stp1_29 = stp2_29; \
2675 stp1_30 = stp2_30; \
2676 stp1_31 = stp2_31; \
2682 const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
2683 const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
2684 const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
2685 const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
2687 const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
2688 const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
2689 const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
2690 const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
2692 const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
2693 const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
2694 const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
2695 const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
2697 const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
2698 const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
2699 const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
2700 const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
2702 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
2703 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
2705 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
2706 stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
2707 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
2708 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
2710 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
2711 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
2717 const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
2718 const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
2719 const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
2720 const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
2722 const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
2723 const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
2724 const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
2725 const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
2727 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
2728 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
2730 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
2731 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
2734 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
2735 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
2736 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
2737 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
2739 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
2740 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
2741 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
2742 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
2744 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
2745 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
2746 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
2747 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
2749 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
2750 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
2751 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
2752 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
2757 const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
2758 const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
2759 const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
2760 const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
2762 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
2763 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
2764 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2765 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2767 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2768 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2769 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2770 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2772 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
2773 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
2776 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
2777 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
2778 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
2779 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
2780 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
2781 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
2782 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
2783 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
2785 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
2786 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
2788 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
2789 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2792 stp1_16 = stp2_16; \
2793 stp1_31 = stp2_31; \
2794 stp1_19 = stp2_19; \
2795 stp1_20 = stp2_20; \
2796 stp1_23 = stp2_23; \
2797 stp1_24 = stp2_24; \
2798 stp1_27 = stp2_27; \
2799 stp1_28 = stp2_28; \
2804 const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
2805 const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
2806 const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
2807 const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
2809 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
2810 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
2811 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2812 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2814 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
2815 stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
2817 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
2818 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
2819 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
2820 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
2822 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
2823 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
2827 stp2_15 = stp1_15; \
2828 stp2_11 = stp1_11; \
2829 stp2_12 = stp1_12; \
2831 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
2832 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
2833 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
2834 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
2835 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
2836 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
2837 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
2838 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
2840 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
2841 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
2842 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
2843 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
2844 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
2845 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
2846 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
2847 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
2852 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
2853 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
2854 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
2855 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
2857 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
2858 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
2859 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2860 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2862 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2863 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2865 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
2866 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
2867 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
2868 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
2870 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
2871 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
2872 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
2873 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
2875 tmp0 = _mm_add_epi32(tmp0, rounding); \
2876 tmp1 = _mm_add_epi32(tmp1, rounding); \
2877 tmp2 = _mm_add_epi32(tmp2, rounding); \
2878 tmp3 = _mm_add_epi32(tmp3, rounding); \
2880 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
2881 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
2882 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
2883 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
2885 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
2886 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
2891 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
2892 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
2893 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
2894 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
2895 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
2896 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
2897 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
2898 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
2900 stp1_16 = stp2_16; \
2901 stp1_17 = stp2_17; \
2903 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
2904 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
2906 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
2907 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
2910 stp1_22 = stp2_22; \
2911 stp1_23 = stp2_23; \
2912 stp1_24 = stp2_24; \
2913 stp1_25 = stp2_25; \
2914 stp1_30 = stp2_30; \
2915 stp1_31 = stp2_31; \
2920 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
2921 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
2922 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
2923 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
2925 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
2926 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
2927 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
2928 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
2929 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
2930 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
2931 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
2932 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
2936 stp2_14 = stp1_14; \
2937 stp2_15 = stp1_15; \
2939 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
2940 stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
2943 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
2944 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
2945 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
2946 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
2947 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
2948 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
2949 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
2950 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
2952 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
2953 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
2954 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
2955 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
2956 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
2957 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
2958 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
2959 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
2964 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
2965 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
2966 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
2967 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
2969 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
2970 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
2971 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
2972 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
2974 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
2975 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
2976 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
2977 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
2978 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
2979 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
2980 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
2981 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
2982 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
2983 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
2984 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
2985 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
2986 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
2987 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
2988 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
2989 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
2991 stp1_16 = stp2_16; \
2992 stp1_17 = stp2_17; \
2993 stp1_18 = stp2_18; \
2994 stp1_19 = stp2_19; \
2996 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
2997 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
2999 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
3000 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
3003 stp1_28 = stp2_28; \
3004 stp1_29 = stp2_29; \
3005 stp1_30 = stp2_30; \
3006 stp1_31 = stp2_31; \
3009 // Only the upper-left 8x8 block has non-zero coefficients.
3010 void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
3012 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3013 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3015 // idct constants for each stage
3016 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3017 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3018 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3019 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3020 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3021 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3022 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3023 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3025 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3026 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3027 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3028 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3030 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3031 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3032 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3033 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3034 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3035 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3036 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3037 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3039 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3040 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3041 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3042 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3043 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3045 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3047 __m128i in[32], col[32];
3048 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3049 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3050 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
3051 stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
3052 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3053 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3054 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
3055 stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
3056 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3059 // Load input data. Only need to load the top left 8x8 block.
// Rows of the 32x32 coefficient block are 32 entries apart, so stepping the
// pointer by 32 fetches the first 8 coefficients of each of the top 8 rows.
3060 in[0] = load_input_data(input);
3061 in[1] = load_input_data(input + 32);
3062 in[2] = load_input_data(input + 64);
3063 in[3] = load_input_data(input + 96);
3064 in[4] = load_input_data(input + 128);
3065 in[5] = load_input_data(input + 160);
3066 in[6] = load_input_data(input + 192);
3067 in[7] = load_input_data(input + 224);
// All remaining rows are known to be zero for the 34-coefficient case.
3069 for (i = 8; i < 32; ++i) {
3070 in[i] = _mm_setzero_si128();
3073 array_transpose_8x8(in, in);
3074 // TODO(hkuang): The following transposes are unnecessary, but removing
3075 // them leads to a performance drop on some devices.
3076 array_transpose_8x8(in + 8, in + 8);
3077 array_transpose_8x8(in + 16, in + 16);
3078 array_transpose_8x8(in + 24, in + 24);
3082 // 1_D: Store 32 intermediate results for each 8x32 block.
3083 col[0] = _mm_add_epi16(stp1_0, stp1_31);
3084 col[1] = _mm_add_epi16(stp1_1, stp1_30);
3085 col[2] = _mm_add_epi16(stp1_2, stp1_29);
3086 col[3] = _mm_add_epi16(stp1_3, stp1_28);
3087 col[4] = _mm_add_epi16(stp1_4, stp1_27);
3088 col[5] = _mm_add_epi16(stp1_5, stp1_26);
3089 col[6] = _mm_add_epi16(stp1_6, stp1_25);
3090 col[7] = _mm_add_epi16(stp1_7, stp1_24);
3091 col[8] = _mm_add_epi16(stp1_8, stp1_23);
3092 col[9] = _mm_add_epi16(stp1_9, stp1_22);
3093 col[10] = _mm_add_epi16(stp1_10, stp1_21);
3094 col[11] = _mm_add_epi16(stp1_11, stp1_20);
3095 col[12] = _mm_add_epi16(stp1_12, stp1_19);
3096 col[13] = _mm_add_epi16(stp1_13, stp1_18);
3097 col[14] = _mm_add_epi16(stp1_14, stp1_17);
3098 col[15] = _mm_add_epi16(stp1_15, stp1_16);
3099 col[16] = _mm_sub_epi16(stp1_15, stp1_16);
3100 col[17] = _mm_sub_epi16(stp1_14, stp1_17);
3101 col[18] = _mm_sub_epi16(stp1_13, stp1_18);
3102 col[19] = _mm_sub_epi16(stp1_12, stp1_19);
3103 col[20] = _mm_sub_epi16(stp1_11, stp1_20);
3104 col[21] = _mm_sub_epi16(stp1_10, stp1_21);
3105 col[22] = _mm_sub_epi16(stp1_9, stp1_22);
3106 col[23] = _mm_sub_epi16(stp1_8, stp1_23);
3107 col[24] = _mm_sub_epi16(stp1_7, stp1_24);
3108 col[25] = _mm_sub_epi16(stp1_6, stp1_25);
3109 col[26] = _mm_sub_epi16(stp1_5, stp1_26);
3110 col[27] = _mm_sub_epi16(stp1_4, stp1_27);
3111 col[28] = _mm_sub_epi16(stp1_3, stp1_28);
3112 col[29] = _mm_sub_epi16(stp1_2, stp1_29);
3113 col[30] = _mm_sub_epi16(stp1_1, stp1_30);
3114 col[31] = _mm_sub_epi16(stp1_0, stp1_31);
3115 for (i = 0; i < 4; i++) {
3117 const __m128i zero = _mm_setzero_si128();
3118 // Transpose 32x8 block to 8x32 block
3119 array_transpose_8x8(col + i * 8, in);
3122 // 2_D: Calculate the results and store them to destination.
3123 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3124 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3125 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3126 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3127 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3128 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3129 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3130 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3131 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3132 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3133 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3134 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3135 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3136 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3137 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3138 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3139 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3140 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3141 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3142 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3143 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3144 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3145 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3146 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3147 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3148 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3149 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3150 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3151 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3152 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3153 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3154 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3156 for (j = 0; j < 32; ++j) {
3157 // Final rounding and shift
3158 in[j] = _mm_adds_epi16(in[j], final_rounding);
3159 in[j] = _mm_srai_epi16(in[j], 6);
3160 RECON_AND_STORE(dest + j * stride, in[j]);
3167 void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
3169 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
3170 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
3171 const __m128i zero = _mm_setzero_si128();
3173 // idct constants for each stage
3174 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
3175 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
3176 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
3177 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
3178 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
3179 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
3180 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
3181 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
3182 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
3183 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
3184 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
3185 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
3186 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
3187 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
3188 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
3189 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
3191 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
3192 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
3193 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
3194 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
3195 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
3196 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
3197 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
3198 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
3200 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
3201 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
3202 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
3203 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
3204 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
3205 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
3206 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
3207 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
3208 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
3209 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
3211 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
3212 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
3213 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
3214 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
3215 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
3216 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
3217 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
3219 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
3221 __m128i in[32], col[128], zero_idx[16];
3222 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
3223 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
3224 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
3225 stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
3226 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
3227 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
3228 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
3229 stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
3230 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3233 for (i = 0; i < 4; i++) {
3237 LOAD_DQCOEFF(in[0], input);
3238 LOAD_DQCOEFF(in[8], input);
3239 LOAD_DQCOEFF(in[16], input);
3240 LOAD_DQCOEFF(in[24], input);
3241 LOAD_DQCOEFF(in[1], input);
3242 LOAD_DQCOEFF(in[9], input);
3243 LOAD_DQCOEFF(in[17], input);
3244 LOAD_DQCOEFF(in[25], input);
3245 LOAD_DQCOEFF(in[2], input);
3246 LOAD_DQCOEFF(in[10], input);
3247 LOAD_DQCOEFF(in[18], input);
3248 LOAD_DQCOEFF(in[26], input);
3249 LOAD_DQCOEFF(in[3], input);
3250 LOAD_DQCOEFF(in[11], input);
3251 LOAD_DQCOEFF(in[19], input);
3252 LOAD_DQCOEFF(in[27], input);
3254 LOAD_DQCOEFF(in[4], input);
3255 LOAD_DQCOEFF(in[12], input);
3256 LOAD_DQCOEFF(in[20], input);
3257 LOAD_DQCOEFF(in[28], input);
3258 LOAD_DQCOEFF(in[5], input);
3259 LOAD_DQCOEFF(in[13], input);
3260 LOAD_DQCOEFF(in[21], input);
3261 LOAD_DQCOEFF(in[29], input);
3262 LOAD_DQCOEFF(in[6], input);
3263 LOAD_DQCOEFF(in[14], input);
3264 LOAD_DQCOEFF(in[22], input);
3265 LOAD_DQCOEFF(in[30], input);
3266 LOAD_DQCOEFF(in[7], input);
3267 LOAD_DQCOEFF(in[15], input);
3268 LOAD_DQCOEFF(in[23], input);
3269 LOAD_DQCOEFF(in[31], input);
3271 // checking if all entries are zero
3272 zero_idx[0] = _mm_or_si128(in[0], in[1]);
3273 zero_idx[1] = _mm_or_si128(in[2], in[3]);
3274 zero_idx[2] = _mm_or_si128(in[4], in[5]);
3275 zero_idx[3] = _mm_or_si128(in[6], in[7]);
3276 zero_idx[4] = _mm_or_si128(in[8], in[9]);
3277 zero_idx[5] = _mm_or_si128(in[10], in[11]);
3278 zero_idx[6] = _mm_or_si128(in[12], in[13]);
3279 zero_idx[7] = _mm_or_si128(in[14], in[15]);
3280 zero_idx[8] = _mm_or_si128(in[16], in[17]);
3281 zero_idx[9] = _mm_or_si128(in[18], in[19]);
3282 zero_idx[10] = _mm_or_si128(in[20], in[21]);
3283 zero_idx[11] = _mm_or_si128(in[22], in[23]);
3284 zero_idx[12] = _mm_or_si128(in[24], in[25]);
3285 zero_idx[13] = _mm_or_si128(in[26], in[27]);
3286 zero_idx[14] = _mm_or_si128(in[28], in[29]);
3287 zero_idx[15] = _mm_or_si128(in[30], in[31]);
3289 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3290 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3291 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3292 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3293 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3294 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3295 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3296 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
3298 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
3299 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
3300 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
3301 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
3302 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
3303 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
3304 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
3306 if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
3307 col[i32 + 0] = _mm_setzero_si128();
3308 col[i32 + 1] = _mm_setzero_si128();
3309 col[i32 + 2] = _mm_setzero_si128();
3310 col[i32 + 3] = _mm_setzero_si128();
3311 col[i32 + 4] = _mm_setzero_si128();
3312 col[i32 + 5] = _mm_setzero_si128();
3313 col[i32 + 6] = _mm_setzero_si128();
3314 col[i32 + 7] = _mm_setzero_si128();
3315 col[i32 + 8] = _mm_setzero_si128();
3316 col[i32 + 9] = _mm_setzero_si128();
3317 col[i32 + 10] = _mm_setzero_si128();
3318 col[i32 + 11] = _mm_setzero_si128();
3319 col[i32 + 12] = _mm_setzero_si128();
3320 col[i32 + 13] = _mm_setzero_si128();
3321 col[i32 + 14] = _mm_setzero_si128();
3322 col[i32 + 15] = _mm_setzero_si128();
3323 col[i32 + 16] = _mm_setzero_si128();
3324 col[i32 + 17] = _mm_setzero_si128();
3325 col[i32 + 18] = _mm_setzero_si128();
3326 col[i32 + 19] = _mm_setzero_si128();
3327 col[i32 + 20] = _mm_setzero_si128();
3328 col[i32 + 21] = _mm_setzero_si128();
3329 col[i32 + 22] = _mm_setzero_si128();
3330 col[i32 + 23] = _mm_setzero_si128();
3331 col[i32 + 24] = _mm_setzero_si128();
3332 col[i32 + 25] = _mm_setzero_si128();
3333 col[i32 + 26] = _mm_setzero_si128();
3334 col[i32 + 27] = _mm_setzero_si128();
3335 col[i32 + 28] = _mm_setzero_si128();
3336 col[i32 + 29] = _mm_setzero_si128();
3337 col[i32 + 30] = _mm_setzero_si128();
3338 col[i32 + 31] = _mm_setzero_si128();
3342 // Transpose 32x8 block to 8x32 block
3343 array_transpose_8x8(in, in);
3344 array_transpose_8x8(in + 8, in + 8);
3345 array_transpose_8x8(in + 16, in + 16);
3346 array_transpose_8x8(in + 24, in + 24);
3350 // 1_D: Store 32 intermediate results for each 8x32 block.
3351 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
3352 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
3353 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
3354 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
3355 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
3356 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
3357 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
3358 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
3359 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
3360 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
3361 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
3362 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
3363 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
3364 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
3365 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
3366 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
3367 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
3368 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
3369 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
3370 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
3371 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
3372 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
3373 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
3374 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
3375 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
3376 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
3377 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
3378 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
3379 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
3380 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
3381 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
3382 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
3384 for (i = 0; i < 4; i++) {
3388 // Transpose 32x8 block to 8x32 block
3389 array_transpose_8x8(col + j, in);
3390 array_transpose_8x8(col + j + 32, in + 8);
3391 array_transpose_8x8(col + j + 64, in + 16);
3392 array_transpose_8x8(col + j + 96, in + 24);
3396 // 2_D: Calculate the results and store them to destination.
3397 in[0] = _mm_add_epi16(stp1_0, stp1_31);
3398 in[1] = _mm_add_epi16(stp1_1, stp1_30);
3399 in[2] = _mm_add_epi16(stp1_2, stp1_29);
3400 in[3] = _mm_add_epi16(stp1_3, stp1_28);
3401 in[4] = _mm_add_epi16(stp1_4, stp1_27);
3402 in[5] = _mm_add_epi16(stp1_5, stp1_26);
3403 in[6] = _mm_add_epi16(stp1_6, stp1_25);
3404 in[7] = _mm_add_epi16(stp1_7, stp1_24);
3405 in[8] = _mm_add_epi16(stp1_8, stp1_23);
3406 in[9] = _mm_add_epi16(stp1_9, stp1_22);
3407 in[10] = _mm_add_epi16(stp1_10, stp1_21);
3408 in[11] = _mm_add_epi16(stp1_11, stp1_20);
3409 in[12] = _mm_add_epi16(stp1_12, stp1_19);
3410 in[13] = _mm_add_epi16(stp1_13, stp1_18);
3411 in[14] = _mm_add_epi16(stp1_14, stp1_17);
3412 in[15] = _mm_add_epi16(stp1_15, stp1_16);
3413 in[16] = _mm_sub_epi16(stp1_15, stp1_16);
3414 in[17] = _mm_sub_epi16(stp1_14, stp1_17);
3415 in[18] = _mm_sub_epi16(stp1_13, stp1_18);
3416 in[19] = _mm_sub_epi16(stp1_12, stp1_19);
3417 in[20] = _mm_sub_epi16(stp1_11, stp1_20);
3418 in[21] = _mm_sub_epi16(stp1_10, stp1_21);
3419 in[22] = _mm_sub_epi16(stp1_9, stp1_22);
3420 in[23] = _mm_sub_epi16(stp1_8, stp1_23);
3421 in[24] = _mm_sub_epi16(stp1_7, stp1_24);
3422 in[25] = _mm_sub_epi16(stp1_6, stp1_25);
3423 in[26] = _mm_sub_epi16(stp1_5, stp1_26);
3424 in[27] = _mm_sub_epi16(stp1_4, stp1_27);
3425 in[28] = _mm_sub_epi16(stp1_3, stp1_28);
3426 in[29] = _mm_sub_epi16(stp1_2, stp1_29);
3427 in[30] = _mm_sub_epi16(stp1_1, stp1_30);
3428 in[31] = _mm_sub_epi16(stp1_0, stp1_31);
3430 for (j = 0; j < 32; ++j) {
3431 // Final rounding and shift
3432 in[j] = _mm_adds_epi16(in[j], final_rounding);
3433 in[j] = _mm_srai_epi16(in[j], 6);
3434 RECON_AND_STORE(dest + j * stride, in[j]);
// DC-only 32x32 inverse DCT + reconstruction: every AC coefficient is zero,
// so the whole residual block is one constant value added to `dest`.
3441 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
// NOTE(review): `zero` looks unused in the visible statements; presumably it
// is referenced inside the RECON_AND_STORE macro — confirm against its
// definition in inv_txfm_sse2.h.
3444 const __m128i zero = _mm_setzero_si128();
// Apply the 1-D DC butterfly (scale by cospi_16_64 with DCT rounding) once
// per pass — row pass then column pass — then the final 32x32 rounding
// shift of 6 bits.
3447 a = (int)dct_const_round_shift(input[0] * cospi_16_64);
3448 a = (int)dct_const_round_shift(a * cospi_16_64);
3449 a = ROUND_POWER_OF_TWO(a, 6);
// Broadcast the DC residual into all eight 16-bit lanes.
3451 dc_value = _mm_set1_epi16(a);
// Add the broadcast value to each of the 32 rows, 8 pixels per store.
3453 for (j = 0; j < 32; ++j) {
3454 RECON_AND_STORE(dest + 0 + j * stride, dc_value);
3455 RECON_AND_STORE(dest + 8 + j * stride, dc_value);
3456 RECON_AND_STORE(dest + 16 + j * stride, dc_value);
3457 RECON_AND_STORE(dest + 24 + j * stride, dc_value);
3461 #if CONFIG_VP9_HIGHBITDEPTH
// Clamp eight signed 16-bit pixel values to the valid high-bitdepth pixel
// range [0, (1 << bd) - 1], branchlessly.
3462 static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
3463 __m128i ubounded, retval;
3464 const __m128i zero = _mm_set1_epi16(0);
3465 const __m128i one = _mm_set1_epi16(1);
// max = (1 << bd) - 1, the largest representable pixel value for depth bd.
3466 const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
// Upper clamp: per-lane select of `max` where value > max, else `value`.
3467 ubounded = _mm_cmpgt_epi16(value, max);
3468 retval = _mm_andnot_si128(ubounded, value);
3469 ubounded = _mm_and_si128(ubounded, max);
3470 retval = _mm_or_si128(retval, ubounded);
// Lower clamp: zero out lanes that are not strictly greater than zero
// (lanes equal to zero are already zero, so masking them is harmless).
3471 retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
// High-bitdepth 4x4 inverse DCT + reconstruction.
// Strategy: pack the 32-bit coefficients to 16 bits and test whether they
// all lie within [-12043, 12043]; if so the 16-bit SSE2 transform is used
// without risk of overflow, otherwise the code falls back to the scalar
// vpx_highbd_idct4_c path. The same range test is repeated after the row
// transform before committing to the optimised column pass.
// (The 12043 bound is the overflow-safety threshold for this transform —
// presumably derived from the transform gain; verify against the C model.)
3475 void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
3476 int stride, int bd) {
3477 tran_low_t out[4 * 4];
3478 tran_low_t *outptr = out;
3481 __m128i sign_bits[2];
3482 __m128i temp_mm, min_input, max_input;
3484 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3485 int optimised_cols = 0;
3486 const __m128i zero = _mm_set1_epi16(0);
3487 const __m128i eight = _mm_set1_epi16(8);
3488 const __m128i max = _mm_set1_epi16(12043);
3489 const __m128i min = _mm_set1_epi16(-12043);
3490 // Load input into __m128i
3491 inptr[0] = _mm_loadu_si128((const __m128i *)input);
3492 inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
3493 inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
3494 inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
// Pack the sixteen 32-bit coefficients into two registers of 16-bit values
// (saturating, but the range test below rejects out-of-range inputs anyway).
3497 inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
3498 inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
// Range test: `test` is non-zero iff any coefficient falls outside
// [-12043, 12043].
3500 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3501 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3502 max_input = _mm_cmpgt_epi16(max_input, max);
3503 min_input = _mm_cmplt_epi16(min_input, min);
3504 temp_mm = _mm_or_si128(max_input, min_input);
3505 test = _mm_movemask_epi8(temp_mm);
3508 // Do the row transform
3511 // Check the min & max values
3512 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3513 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3514 max_input = _mm_cmpgt_epi16(max_input, max);
3515 min_input = _mm_cmplt_epi16(min_input, min);
3516 temp_mm = _mm_or_si128(max_input, min_input);
3517 test = _mm_movemask_epi8(temp_mm);
// Row-transformed values are in range: transpose for the column pass and
// sign-extend the 16-bit values back to 32-bit tran_low_t in `out`.
3520 transpose_4x4(inptr);
3521 sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
3522 sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
3523 inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
3524 inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
3525 inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
3526 inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
3527 _mm_storeu_si128((__m128i *)outptr, inptr[0]);
3528 _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
3529 _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
3530 _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
3532 // Set to use the optimised transform for the column
3536 // Run the un-optimised row transform
3537 for (i = 0; i < 4; ++i) {
3538 vpx_highbd_idct4_c(input, outptr, bd);
3544 if (optimised_cols) {
// Inverse 4x4 transform rounding: add 8 then arithmetic shift right by 4.
3547 // Final round and shift
3548 inptr[0] = _mm_add_epi16(inptr[0], eight);
3549 inptr[1] = _mm_add_epi16(inptr[1], eight);
3551 inptr[0] = _mm_srai_epi16(inptr[0], 4);
3552 inptr[1] = _mm_srai_epi16(inptr[1], 4);
3554 // Reconstruction and Store
// Gather the four 4-pixel destination rows as two 8-lane registers
// (rows 0|1 in d0, rows 2|3 in d2), add the residual, clamp to [0, 2^bd-1].
3556 __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
3557 __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
3558 d0 = _mm_unpacklo_epi64(
3559 d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
3560 d2 = _mm_unpacklo_epi64(
3561 d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
3562 d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
3563 d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
// Scatter the two registers back out as four 64-bit row stores.
3565 _mm_storel_epi64((__m128i *)dest, d0);
3567 d0 = _mm_srli_si128(d0, 8);
3568 _mm_storel_epi64((__m128i *)(dest + stride), d0);
3570 _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
3572 d2 = _mm_srli_si128(d2, 8);
3573 _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
3576 // Run the un-optimised column transform
3577 tran_low_t temp_in[4], temp_out[4];
3579 for (i = 0; i < 4; ++i) {
3580 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
3581 vpx_highbd_idct4_c(temp_in, temp_out, bd);
3582 for (j = 0; j < 4; ++j) {
3583 dest[j * stride + i] = highbd_clip_pixel_add(
3584 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
// High-bitdepth 8x8 inverse DCT + reconstruction (all 64 coefficients).
// Same pattern as the 4x4 version: pack to 16 bits, range-test against
// +/-6201 (the overflow-safety bound for the 8x8 16-bit transform), use the
// SSE2 path when safe and fall back to vpx_highbd_idct8_c otherwise.
3590 void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
3591 int stride, int bd) {
3592 tran_low_t out[8 * 8];
3593 tran_low_t *outptr = out;
3596 __m128i min_input, max_input, temp1, temp2, sign_bits;
3597 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3598 const __m128i zero = _mm_set1_epi16(0);
3599 const __m128i sixteen = _mm_set1_epi16(16);
3600 const __m128i max = _mm_set1_epi16(6201);
3601 const __m128i min = _mm_set1_epi16(-6201);
3602 int optimised_cols = 0;
3604 // Load input into __m128i & pack to 16 bits
3605 for (i = 0; i < 8; i++) {
3606 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3607 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3608 inptr[i] = _mm_packs_epi32(temp1, temp2);
3611 // Find the min & max for the row transform
3612 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3613 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3614 for (i = 2; i < 8; i++) {
3615 max_input = _mm_max_epi16(max_input, inptr[i]);
3616 min_input = _mm_min_epi16(min_input, inptr[i]);
// `test` is non-zero iff any coefficient lies outside [-6201, 6201].
3618 max_input = _mm_cmpgt_epi16(max_input, max);
3619 min_input = _mm_cmplt_epi16(min_input, min);
3620 temp1 = _mm_or_si128(max_input, min_input);
3621 test = _mm_movemask_epi8(temp1);
3624 // Do the row transform
3627 // Find the min & max for the column transform
3628 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3629 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3630 for (i = 2; i < 8; i++) {
3631 max_input = _mm_max_epi16(max_input, inptr[i]);
3632 min_input = _mm_min_epi16(min_input, inptr[i]);
3634 max_input = _mm_cmpgt_epi16(max_input, max);
3635 min_input = _mm_cmplt_epi16(min_input, min);
3636 temp1 = _mm_or_si128(max_input, min_input);
3637 test = _mm_movemask_epi8(temp1);
// In range: transpose for the column pass and sign-extend the 16-bit row
// results back to 32-bit tran_low_t in `out`.
3640 array_transpose_8x8(inptr, inptr);
3641 for (i = 0; i < 8; i++) {
3642 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3643 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3644 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3645 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3646 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3649 // Set to use the optimised transform for the column
3653 // Run the un-optimised row transform
3654 for (i = 0; i < 8; ++i) {
3655 vpx_highbd_idct8_c(input, outptr, bd);
3661 if (optimised_cols) {
3664 // Final round & shift and Reconstruction and Store
// 8x8 rounding is add 16 then shift right by 5; residual is added with
// saturation and clamped to the bit-depth range before storing.
3667 for (i = 0; i < 8; i++) {
3668 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3669 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3670 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3671 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3673 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3677 // Run the un-optimised column transform
3678 tran_low_t temp_in[8], temp_out[8];
3679 for (i = 0; i < 8; ++i) {
3680 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
3681 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3682 for (j = 0; j < 8; ++j) {
3683 dest[j * stride + i] = highbd_clip_pixel_add(
3684 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
// High-bitdepth 8x8 inverse DCT + reconstruction for the "10 coefficients"
// case: the caller guarantees all non-zero coefficients sit in the top-left
// 4x4, so the row-pass work only needs the first 4 rows. Otherwise identical
// in structure to vpx_highbd_idct8x8_64_add_sse2 (range test vs +/-6201,
// SSE2 fast path with scalar vpx_highbd_idct8_c fallback).
3690 void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3691 int stride, int bd) {
// Zero-initialised because only part of `out` is written on the fast path.
3692 tran_low_t out[8 * 8] = { 0 };
3693 tran_low_t *outptr = out;
3696 __m128i min_input, max_input, temp1, temp2, sign_bits;
3697 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3698 const __m128i zero = _mm_set1_epi16(0);
3699 const __m128i sixteen = _mm_set1_epi16(16);
3700 const __m128i max = _mm_set1_epi16(6201);
3701 const __m128i min = _mm_set1_epi16(-6201);
3702 int optimised_cols = 0;
3704 // Load input into __m128i & pack to 16 bits
3705 for (i = 0; i < 8; i++) {
3706 temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
3707 temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
3708 inptr[i] = _mm_packs_epi32(temp1, temp2);
3711 // Find the min & max for the row transform
3712 // only first 4 row has non-zero coefs
3713 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3714 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3715 for (i = 2; i < 4; i++) {
3716 max_input = _mm_max_epi16(max_input, inptr[i]);
3717 min_input = _mm_min_epi16(min_input, inptr[i]);
3719 max_input = _mm_cmpgt_epi16(max_input, max);
3720 min_input = _mm_cmplt_epi16(min_input, min);
3721 temp1 = _mm_or_si128(max_input, min_input);
3722 test = _mm_movemask_epi8(temp1);
3725 // Do the row transform
3728 // Find the min & max for the column transform
3729 // N.B. Only first 4 cols contain non-zero coeffs
// After the row pass the values have spread down every row, so all 8
// registers are scanned here (unlike the 4-row scan above).
3730 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3731 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3732 for (i = 2; i < 8; i++) {
3733 max_input = _mm_max_epi16(max_input, inptr[i]);
3734 min_input = _mm_min_epi16(min_input, inptr[i]);
3736 max_input = _mm_cmpgt_epi16(max_input, max);
3737 min_input = _mm_cmplt_epi16(min_input, min);
3738 temp1 = _mm_or_si128(max_input, min_input);
3739 test = _mm_movemask_epi8(temp1);
3742 // Use fact only first 4 rows contain non-zero coeffs
// Transpose just the populated half, then sign-extend those 4 rows of
// 16-bit results back to 32-bit tran_low_t in `out`.
3743 array_transpose_4X8(inptr, inptr);
3744 for (i = 0; i < 4; i++) {
3745 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3746 temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3747 temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3748 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
3749 _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
3752 // Set to use the optimised transform for the column
3756 // Run the un-optimised row transform
3757 for (i = 0; i < 4; ++i) {
3758 vpx_highbd_idct8_c(input, outptr, bd);
3764 if (optimised_cols) {
3767 // Final round & shift and Reconstruction and Store
// 8x8 rounding: add 16, shift right 5; saturating add of the residual,
// then clamp to the bit-depth pixel range.
3770 for (i = 0; i < 8; i++) {
3771 inptr[i] = _mm_add_epi16(inptr[i], sixteen);
3772 d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3773 inptr[i] = _mm_srai_epi16(inptr[i], 5);
3774 d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
3776 _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
3780 // Run the un-optimised column transform
3781 tran_low_t temp_in[8], temp_out[8];
3782 for (i = 0; i < 8; ++i) {
3783 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
3784 vpx_highbd_idct8_c(temp_in, temp_out, bd);
3785 for (j = 0; j < 8; ++j) {
3786 dest[j * stride + i] = highbd_clip_pixel_add(
3787 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
// High-bitdepth 16x16 inverse DCT + reconstruction (all 256 coefficients).
// Each 16-value row occupies two __m128i registers: inptr[i] holds columns
// 0-7 of row i and inptr[i + 16] holds columns 8-15. Coefficients are
// range-tested against +/-3155 (the 16x16 overflow-safety bound); the SSE2
// idct16_sse2 path is used when safe, vpx_highbd_idct16_c otherwise.
3793 void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
3794 int stride, int bd) {
3795 tran_low_t out[16 * 16];
3796 tran_low_t *outptr = out;
3799 __m128i min_input, max_input, temp1, temp2, sign_bits;
3800 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3801 const __m128i zero = _mm_set1_epi16(0);
3802 const __m128i rounding = _mm_set1_epi16(32);
3803 const __m128i max = _mm_set1_epi16(3155);
3804 const __m128i min = _mm_set1_epi16(-3155);
3805 int optimised_cols = 0;
3807 // Load input into __m128i & pack to 16 bits
3808 for (i = 0; i < 16; i++) {
3809 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3810 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3811 inptr[i] = _mm_packs_epi32(temp1, temp2);
3812 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3813 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3814 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3817 // Find the min & max for the row transform
3818 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3819 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3820 for (i = 2; i < 32; i++) {
3821 max_input = _mm_max_epi16(max_input, inptr[i]);
3822 min_input = _mm_min_epi16(min_input, inptr[i]);
// `test` is non-zero iff any coefficient lies outside [-3155, 3155].
3824 max_input = _mm_cmpgt_epi16(max_input, max);
3825 min_input = _mm_cmplt_epi16(min_input, min);
3826 temp1 = _mm_or_si128(max_input, min_input);
3827 test = _mm_movemask_epi8(temp1);
3830 // Do the row transform
3831 idct16_sse2(inptr, inptr + 16);
3833 // Find the min & max for the column transform
3834 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3835 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3836 for (i = 2; i < 32; i++) {
3837 max_input = _mm_max_epi16(max_input, inptr[i]);
3838 min_input = _mm_min_epi16(min_input, inptr[i]);
3840 max_input = _mm_cmpgt_epi16(max_input, max);
3841 min_input = _mm_cmplt_epi16(min_input, min);
3842 temp1 = _mm_or_si128(max_input, min_input);
3843 test = _mm_movemask_epi8(temp1);
// In range: transpose for the column pass, then sign-extend the 16-bit row
// results (left half and right half of each row) back to 32-bit tran_low_t.
3846 array_transpose_16x16(inptr, inptr + 16);
3847 for (i = 0; i < 16; i++) {
3848 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3849 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3850 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3851 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3852 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3853 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3854 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3855 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3856 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3857 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3860 // Set to use the optimised transform for the column
3864 // Run the un-optimised row transform
3865 for (i = 0; i < 16; ++i) {
3866 vpx_highbd_idct16_c(input, outptr, bd);
3872 if (optimised_cols) {
3873 idct16_sse2(inptr, inptr + 16);
3875 // Final round & shift and Reconstruction and Store
// 16x16 rounding: add 32 then shift right 6; residual added and clamped to
// the bit-depth pixel range per 8-pixel half-row.
3878 for (i = 0; i < 16; i++) {
3879 inptr[i] = _mm_add_epi16(inptr[i], rounding);
3880 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
3881 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
3882 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
3883 inptr[i] = _mm_srai_epi16(inptr[i], 6);
3884 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
3885 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
3886 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
3888 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
3889 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
3893 // Run the un-optimised column transform
3894 tran_low_t temp_in[16], temp_out[16];
3895 for (i = 0; i < 16; ++i) {
3896 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
3897 vpx_highbd_idct16_c(temp_in, temp_out, bd);
3898 for (j = 0; j < 16; ++j) {
3899 dest[j * stride + i] = highbd_clip_pixel_add(
3900 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
// High-bitdepth 16x16 inverse DCT + reconstruction for the "10 coefficients"
// case: the caller guarantees all non-zero coefficients are in the top-left
// 4x4 area, so the pre-transform range scan and the post-transform
// sign-extension only touch the populated rows. Otherwise structured like
// vpx_highbd_idct16x16_256_add_sse2 (range test vs +/-3155, idct16_sse2 fast
// path, scalar vpx_highbd_idct16_c fallback).
3906 void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
3907 int stride, int bd) {
// Zero-initialised because only part of `out` is written on the fast path.
3908 tran_low_t out[16 * 16] = { 0 };
3909 tran_low_t *outptr = out;
3912 __m128i min_input, max_input, temp1, temp2, sign_bits;
3913 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
3914 const __m128i zero = _mm_set1_epi16(0);
3915 const __m128i rounding = _mm_set1_epi16(32);
3916 const __m128i max = _mm_set1_epi16(3155);
3917 const __m128i min = _mm_set1_epi16(-3155);
3918 int optimised_cols = 0;
3920 // Load input into __m128i & pack to 16 bits
3921 for (i = 0; i < 16; i++) {
3922 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
3923 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
3924 inptr[i] = _mm_packs_epi32(temp1, temp2);
3925 temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
3926 temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
3927 inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
3930 // Find the min & max for the row transform
3931 // Since all non-zero dct coefficients are in upper-left 4x4 area,
3932 // we only need to consider first 4 rows here.
3933 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3934 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3935 for (i = 2; i < 4; i++) {
3936 max_input = _mm_max_epi16(max_input, inptr[i]);
3937 min_input = _mm_min_epi16(min_input, inptr[i]);
3939 max_input = _mm_cmpgt_epi16(max_input, max);
3940 min_input = _mm_cmplt_epi16(min_input, min);
3941 temp1 = _mm_or_si128(max_input, min_input);
3942 test = _mm_movemask_epi8(temp1);
3945 // Do the row transform (N.B. This transposes inptr)
3946 idct16_sse2(inptr, inptr + 16);
3948 // Find the min & max for the column transform
3949 // N.B. Only first 4 cols contain non-zero coeffs
// After the row pass the values span all 16 rows of the left half, so the
// scan covers inptr[0..15].
3950 max_input = _mm_max_epi16(inptr[0], inptr[1]);
3951 min_input = _mm_min_epi16(inptr[0], inptr[1]);
3952 for (i = 2; i < 16; i++) {
3953 max_input = _mm_max_epi16(max_input, inptr[i]);
3954 min_input = _mm_min_epi16(min_input, inptr[i]);
3956 max_input = _mm_cmpgt_epi16(max_input, max);
3957 min_input = _mm_cmplt_epi16(min_input, min);
3958 temp1 = _mm_or_si128(max_input, min_input);
3959 test = _mm_movemask_epi8(temp1);
3962 // Use fact only first 4 rows contain non-zero coeffs
// Transpose only the populated 8x8 halves, then sign-extend the first 4
// rows of 16-bit results back to 32-bit tran_low_t in `out`.
3963 array_transpose_8x8(inptr, inptr);
3964 array_transpose_8x8(inptr + 8, inptr + 16);
3965 for (i = 0; i < 4; i++) {
3966 sign_bits = _mm_cmplt_epi16(inptr[i], zero);
3967 temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
3968 temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
3969 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
3970 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
3971 sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
3972 temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
3973 temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
3974 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
3975 _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
3978 // Set to use the optimised transform for the column
3982 // Run the un-optimised row transform
3983 for (i = 0; i < 4; ++i) {
3984 vpx_highbd_idct16_c(input, outptr, bd);
3990 if (optimised_cols) {
3991 idct16_sse2(inptr, inptr + 16);
3993 // Final round & shift and Reconstruction and Store
// 16x16 rounding: add 32 then shift right 6; residual added and clamped to
// the bit-depth pixel range per 8-pixel half-row.
3996 for (i = 0; i < 16; i++) {
3997 inptr[i] = _mm_add_epi16(inptr[i], rounding);
3998 inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
3999 d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
4000 d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
4001 inptr[i] = _mm_srai_epi16(inptr[i], 6);
4002 inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
4003 d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
4004 d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
4006 _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
4007 _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
4011 // Run the un-optimised column transform
4012 tran_low_t temp_in[16], temp_out[16];
4013 for (i = 0; i < 16; ++i) {
4014 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
4015 vpx_highbd_idct16_c(temp_in, temp_out, bd);
4016 for (j = 0; j < 16; ++j) {
4017 dest[j * stride + i] = highbd_clip_pixel_add(
4018 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
4023 #endif // CONFIG_VP9_HIGHBITDEPTH