/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

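// SSE2 implementations of the 4x4, 8x8 and 16x16 inverse DCT/ADST
// transforms. Each vpx_idct*_add() function inverse-transforms a block of
// coefficients and adds the saturated result to the destination predictor.
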
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i eight = _mm_set1_epi16(8);
  __m128i in[2];

  // Rows
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8);
  idct4_sse2(in);

  // Columns
  idct4_sse2(in);

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}

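// DC-only shortcut: input[0] is the only nonzero coefficient, so the 2-D
// transform collapses to two cospi_16_64 multiplies and one constant that is
// added to all 16 pixels.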
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const __m128i zero = _mm_setzero_si128();
  int a;
  __m128i dc_value, d[2];

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  // Reconstruction and Store
  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
  d[0] = _mm_unpacklo_epi32(d[0],
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d[1] = _mm_unpacklo_epi32(
      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], dc_value);
  d[1] = _mm_add_epi16(d[1], dc_value);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  *(int *)dest = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}

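// 1-D 4-point IDCT over both halves of in[0]/in[1]; the leading transpose
// turns row data into column data, so two calls give the 2-D transform.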
void idct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  __m128i u[2];

  transpose_16bit_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
  u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

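// 1-D 4-point inverse ADST, with the same two-register layout and leading
// transpose as idct4_sse2.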
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_16bit_4x4(in);
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

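// Butterfly helper: multiplies one interleaved input pair by two constant
// pairs and produces both rounded 16-bit results in res0/res1.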
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  { \
    res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0); \
    res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1); \
  }

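// Four-stage 8-point IDCT butterfly over eight 16-bit registers. The
// stp1_*/stp2_* temporaries and the stg*_* cosine constants must be declared
// by the caller.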
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
              out4, out5, out6, out7) \
  { \
    /* Stage1 */ \
    { \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
      \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
    } \
    \
    /* Stage2 */ \
    { \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
      \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
      \
      stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    } \
    \
    /* Stage3 */ \
    { \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
      \
      stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
      \
      stp1_5 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_1); \
      stp1_6 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_0); \
    } \
    \
    /* Stage4 */ \
    out0 = _mm_add_epi16(stp1_0, stp2_7); \
    out1 = _mm_add_epi16(stp1_1, stp1_6); \
    out2 = _mm_add_epi16(stp1_2, stp1_5); \
    out3 = _mm_add_epi16(stp1_3, stp2_4); \
    out4 = _mm_sub_epi16(stp1_3, stp2_4); \
    out5 = _mm_sub_epi16(stp1_2, stp1_5); \
    out6 = _mm_sub_epi16(stp1_1, stp1_6); \
    out7 = _mm_sub_epi16(stp1_0, stp2_7); \
  }

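// Full 8x8 inverse DCT (up to 64 nonzero coefficients): two 1-D IDCT8 passes
// with a transpose before each pass.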
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
          in6, in7);
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

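// DC-only 8x8 shortcut, analogous to vpx_idct4x4_1_add_sse2 but with the
// 8x8 rounding shift of 5.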
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
  RECON_AND_STORE(dest + 1 * stride, dc_value);
  RECON_AND_STORE(dest + 2 * stride, dc_value);
  RECON_AND_STORE(dest + 3 * stride, dc_value);
  RECON_AND_STORE(dest + 4 * stride, dc_value);
  RECON_AND_STORE(dest + 5 * stride, dc_value);
  RECON_AND_STORE(dest + 6 * stride, dc_value);
  RECON_AND_STORE(dest + 7 * stride, dc_value);
}

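// Single 1-D 8-point IDCT pass over the eight columns of in[].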
void idct8_sse2(__m128i *in) {
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
                in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
        in[4], in[5], in[6], in[7]);
}

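// 1-D 8-point inverse ADST. Intermediate products are kept in 32 bits until
// the final pack back to 16 bits.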
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
  s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
  s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
  s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

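// Sparse 8x8 path: at most 12 nonzero coefficients, all inside the top-left
// 4x4 quadrant, so the row pass only needs 4-wide arithmetic.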
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x8 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);

  // Stage1
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
    stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
    stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);

    tmp0 = _mm_add_epi16(stp1_4, stp1_5);
    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp0 = _mm_add_epi16(stp2_0, stp2_2);
    tmp1 = _mm_sub_epi16(stp2_0, stp2_2);
    stp1_2 = _mm_unpackhi_epi64(tmp1, tmp0);
    stp1_3 = _mm_unpacklo_epi64(tmp1, tmp0);
    stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56);  // stg3_1 = stg2_0
  }

  // Stage4
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
        in5, in6, in7);

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}

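// 16-point IDCT butterfly, stages 2-6, over in[0..15]. Stage 7 is left to
// the callers so they can write the outputs where they need them.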
#define IDCT16 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
                           stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
                           stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
                           stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
  }

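// Reduced butterfly for the mostly-zero 16x16 case: only in[0..3]
// contribute, so several stage outputs collapse to plain copies.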
#define IDCT16_10 \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
                           stp1_12_0) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
    \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
    \
    stp1_9 = stp1_8_0; \
    stp1_10 = stp1_11; \
    \
    stp1_13 = stp1_12_0; \
    stp1_14 = stp1_15; \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
    stp2_5 = stp2_4; \
    stp2_6 = stp2_7; \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_2 = stp1_1; \
    stp1_3 = stp1_0; \
    \
    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
  }

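// Full 16x16 inverse DCT. Pass one transforms the two 8-column halves into
// l[] and r[]; pass two transforms those and reconstructs 8 columns at a
// time.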
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  int i;

  curr1 = l;
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;
  }
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

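// DC-only 16x16 shortcut: one dc_value register, two 8-pixel stores per row.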
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 16; ++i) {
    RECON_AND_STORE(dest + 0, dc_value);
    RECON_AND_STORE(dest + 8, dc_value);
    dest += stride;
  }
}

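// 1-D 16-point inverse ADST over 8 columns (half of a 16x16 block).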
static void iadst16_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  w0 = _mm_add_epi32(v[0], v[4]);
  w1 = _mm_add_epi32(v[1], v[5]);
  w2 = _mm_add_epi32(v[2], v[6]);
  w3 = _mm_add_epi32(v[3], v[7]);
  w4 = _mm_sub_epi32(v[0], v[4]);
  w5 = _mm_sub_epi32(v[1], v[5]);
  w6 = _mm_sub_epi32(v[2], v[6]);
  w7 = _mm_sub_epi32(v[3], v[7]);
  w8 = _mm_add_epi32(v[8], v[12]);
  w9 = _mm_add_epi32(v[9], v[13]);
  w10 = _mm_add_epi32(v[10], v[14]);
  w11 = _mm_add_epi32(v[11], v[15]);
  w12 = _mm_sub_epi32(v[8], v[12]);
  w13 = _mm_sub_epi32(v[9], v[13]);
  w14 = _mm_sub_epi32(v[10], v[14]);
  w15 = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
  in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
  in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
  in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
  in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
  in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[12] = s[14];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[2];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

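// 1-D 16-point IDCT over 8 columns, the function counterpart of the IDCT16
// macro.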
static void idct16_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
1421 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
1422 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
1423 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
1424 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
1425 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
1426 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
1427 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
1428 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
1430 s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
1431 s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
1432 s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
1433 s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
1434 s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
1435 s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
1436 s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
1437 s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);
  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
  t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
  t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
  t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);
  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
  s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
  s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
  s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
  s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
  s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[11] = t[11];
  s[12] = t[12];
  s[15] = t[15];
  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];
  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  t[5] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
  t[6] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);
  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];
  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
  s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
  s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  s[14] = t[14];
  s[15] = t[15];
  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}
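
// 2-D helpers: transpose the 16x16 block, held as two 8x16 register arrays,
// then run the corresponding 16-point 1-D transform down each 8-column half.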
void idct16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_8col(in0);
  idct16_8col(in1);
}
void iadst16_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_8col(in0);
  iadst16_8col(in1);
}
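
// 16x16 IDCT for the near-empty case: only the top-left 4x4 coefficients can
// be non-zero, so the first pass loads just four 8-wide rows and the unpacks
// below pair real data with zero instead of with a mirrored input row.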
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
      stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3;
  int i, j;
  // First 1-D inverse DCT
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    stp2_8 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_1_15);
    stp2_11 = idct_calc_wraplow_sse2(stg2_6, stg2_7, lo_13_3);
  }
  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    stp1_4 = idct_calc_wraplow_sse2(stg3_0, stg3_1, lo_2_14);
    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
  }
  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = idct_madd_round_shift_sse2(lo_0_8, stg4_0);
    tmp1 = idct_madd_round_shift_sse2(lo_0_8, stg4_1);
    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp1, tmp1);
    stp2_9 = idct_calc_wraplow_sse2(stg4_4, stg4_5, lo_9_14);
    stp2_10 = idct_calc_wraplow_sse2(stg4_6, stg4_7, lo_10_13);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }
  // Stage5 and Stage6
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }
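  // Note on the epi64 unpacks above: in this shortened path each register
  // appears to carry two 4-wide half-rows (its low and high 64 bits belong
  // to different butterfly outputs), so _mm_unpacklo_epi64/_mm_unpackhi_epi64
  // are what peel the packed pairs apart into separate stp1_* values.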
  // Stage6
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    stp1_6 = idct_calc_wraplow_sse2(stg4_0, stg4_1, lo_6_5);
    tmp0 = idct_madd_round_shift_sse2(lo_10_13, stg6_0);
    tmp1 = idct_madd_round_shift_sse2(lo_10_13, stg4_0);
    tmp2 = idct_madd_round_shift_sse2(lo_11_12, stg6_0);
    tmp3 = idct_madd_round_shift_sse2(lo_11_12, stg4_0);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp1, zero);
    stp2_11 = _mm_packs_epi32(tmp2, zero);
    stp2_12 = _mm_packs_epi32(tmp3, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }
  // Stage7. Left 8x16 only.
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    array_transpose_4X8(l + 8 * i, in);

    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);
    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
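
// LOAD_DQCOEFF reads eight dequantized coefficients into `reg` and advances
// the input pointer, so back-to-back invocations walk the coefficient buffer
// in memory order.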
#define LOAD_DQCOEFF(reg, input)  \
  {                               \
    reg = load_input_data(input); \
    input += 8;                   \
  }
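
// The 32-point stages below also rely on MULTIPLICATION_AND_ADD(_2), defined
// earlier (not shown in this excerpt). A hedged sketch: the four-result form
// takes two unpacked inputs (lo0/hi0, lo1/hi1) plus four coefficient pairs
// and emits four packed results, each produced by the same madd + round +
// shift + pack sequence sketched above idct16_8col; the _2 variant handles
// one input and two results. IDCT32_34 is the shortened 32-point column
// transform for the case where only the top-left 8x8 coefficients are
// present, which is why its stage 1 and 2 unpacks pair in[1..7] with zero.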
#define IDCT32_34 \
  /* Stage1 */ \
  { \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
    MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
                             stp1_31) \
    MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
                             stp1_28) \
    MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
                             stp1_27) \
    MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
                             stp1_24) \
  } \
  /* Stage2 */ \
  { \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
    MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
                             stp2_15) \
    MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
                             stp2_12) \
    stp2_16 = stp1_16; \
    stp2_19 = stp1_19; \
    stp2_20 = stp1_20; \
    stp2_23 = stp1_23; \
    stp2_24 = stp1_24; \
    stp2_27 = stp1_27; \
    stp2_28 = stp1_28; \
    stp2_31 = stp1_31; \
  } \
  /* Stage3 */ \
  { \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
    MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
                             stp1_7) \
    stp1_8 = stp2_8; \
    stp1_11 = stp2_11; \
    stp1_12 = stp2_12; \
    stp1_15 = stp2_15; \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
                           stp1_29) \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
                           stp1_25) \
    stp1_16 = stp2_16; \
    stp1_31 = stp2_31; \
    stp1_19 = stp2_19; \
    stp1_20 = stp2_20; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_27 = stp2_27; \
    stp1_28 = stp2_28; \
  } \
  /* Stage4 */ \
  { \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
    MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
                             stp2_1) \
    stp2_4 = stp1_4; \
    stp2_5 = stp1_4; \
    stp2_6 = stp1_7; \
    stp2_7 = stp1_7; \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
    stp2_8 = stp1_8; \
    stp2_15 = stp1_15; \
    stp2_11 = stp1_11; \
    stp2_12 = stp1_12; \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  } \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    stp1_0 = stp2_0; \
    stp1_1 = stp2_1; \
    stp1_2 = stp2_1; \
    stp1_3 = stp2_0; \
    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
    stp1_4 = stp2_4; \
    stp1_7 = stp2_7; \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
                           stp1_28) \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    stp1_22 = stp2_22; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_25 = stp2_25; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  } \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
    stp2_8 = stp1_8; \
    stp2_9 = stp1_9; \
    stp2_14 = stp1_14; \
    stp2_15 = stp1_15; \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  } \
  /* Stage7 */ \
  { \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    stp1_18 = stp2_18; \
    stp1_19 = stp2_19; \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24) \
    stp1_28 = stp2_28; \
    stp1_29 = stp2_29; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  }
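
// IDCT32 is the full 32-point counterpart of IDCT32_34: the same stage
// structure, but every unpack pairs a coefficient row with its mirror
// (in[1] with in[31], in[17] with in[15], ...) instead of with zero.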
#define IDCT32 \
  /* Stage1 */ \
  { \
    const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
    const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
    const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
    const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
    const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
    const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
    const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
    const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
    const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
    const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
    const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
    const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
    const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
    const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
    const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
    const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
    MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
                           stp1_30) \
    MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
                           stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
    MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
                           stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
                           stp1_21, stp1_26) \
    MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
                           stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
                           stp1_23, stp1_24) \
  } \
  /* Stage2 */ \
  { \
    const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
    const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
    const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
    const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
    const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
    const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
    const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
    const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
    MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
                           stp2_14) \
    MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
                           stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
    stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
    stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
    stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
    stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
    stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
    stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
    stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
    stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
    stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
  } \
  /* Stage3 */ \
  { \
    const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
    const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
    const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
    const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
    const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
    const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
                           stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
                           stp1_6) \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
    MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
                           stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
                           stp1_29) \
    MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
                           stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
                           stp1_25) \
    stp1_16 = stp2_16; \
    stp1_31 = stp2_31; \
    stp1_19 = stp2_19; \
    stp1_20 = stp2_20; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_27 = stp2_27; \
    stp1_28 = stp2_28; \
  } \
  /* Stage4 */ \
  { \
    const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
    const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
    const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
    const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
                           stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
                           stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
                           stp2_13) \
    stp2_8 = stp1_8; \
    stp2_15 = stp1_15; \
    stp2_11 = stp1_11; \
    stp2_12 = stp1_12; \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
    stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
    stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
    stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
    stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
    stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
    stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
    stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
    stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
    stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
    stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
  } \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
    const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
    const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
    const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
    stp1_4 = stp2_4; \
    stp1_7 = stp2_7; \
    stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
                           stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
                           stp1_28) \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
                           stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    stp1_22 = stp2_22; \
    stp1_23 = stp2_23; \
    stp1_24 = stp2_24; \
    stp1_25 = stp2_25; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  } \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
    stp2_8 = stp1_8; \
    stp2_9 = stp1_9; \
    stp2_14 = stp1_14; \
    stp2_15 = stp1_15; \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
                           stp2_12) \
    stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
    stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
    stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
    stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
    stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
    stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
    stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
    stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
    stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
    stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
    stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
    stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
    stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
    stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
    stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
    stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
  } \
  /* Stage7 */ \
  { \
    const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
    const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
    const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
    const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
    const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
    const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
    const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
    const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
    stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
    stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
    stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
    stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
    stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
    stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
    stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
    stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
    stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
    stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
    stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
    stp1_16 = stp2_16; \
    stp1_17 = stp2_17; \
    stp1_18 = stp2_18; \
    stp1_19 = stp2_19; \
    MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
                           stp1_26) \
    MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
                           stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
                           stp1_24) \
    stp1_28 = stp2_28; \
    stp1_29 = stp2_29; \
    stp1_30 = stp2_30; \
    stp1_31 = stp2_31; \
  }
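
// The three drivers below differ only in how much of the 32x32 coefficient
// block they assume is non-zero: the 34-coefficient case (top-left 8x8), the
// full 1024-coefficient case, and the DC-only case.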
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  int i, j;
  // Load input data. Only need to load the top left 8x8 block.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);

  array_transpose_8x8(in, in);
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);

  for (i = 0; i < 4; i++) {
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // idct constants for each stage
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[128], zero_idx[16];
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);

    // First 1-D idct: load input data.
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);
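    // The scatter pattern above (in[0], in[8], in[16], in[24], in[1], ...)
    // splits each sequentially-loaded 32-wide coefficient row into the four
    // 8x8 blocks that the transposes below operate on.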

    // checking if all entries are zero
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
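    // zero_idx[14] now ORs together all 256 coefficients of this 8x32 strip;
    // if the movemask test below confirms they are all zero, the whole 1-D
    // pass is skipped and zeros are stored instead.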
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1_D: Store 32 intermediate results for each 8x32 block.
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }

  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);
    for (j = 0; j < 32; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}
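
// DC-only path: inverse-transform the single DC coefficient (two cospi_16_64
// rotations, one per 1-D pass, plus the final rounding shift), then add the
// resulting constant to every pixel of the 32x32 destination block.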
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, j;

  a = (int)dct_const_round_shift(input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (j = 0; j < 32; ++j) {
    RECON_AND_STORE(dest + 0 + j * stride, dc_value);
    RECON_AND_STORE(dest + 8 + j * stride, dc_value);
    RECON_AND_STORE(dest + 16 + j * stride, dc_value);
    RECON_AND_STORE(dest + 24 + j * stride, dc_value);
  }
}