]> granicus.if.org Git - libvpx/blob - vpx_dsp/x86/inv_txfm_sse2.c
Merge "ppc: Add vpx_sadnxmx4d_vsx for n,m = {8, 16, 32 ,64}"
[libvpx] / vpx_dsp / x86 / inv_txfm_sse2.c
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 #include "vpx_dsp/x86/transpose_sse2.h"
14 #include "vpx_dsp/x86/txfm_common_sse2.h"
15
16 void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
17                              int stride) {
18   const __m128i eight = _mm_set1_epi16(8);
19   __m128i in[2];
20
21   // Rows
22   in[0] = load_input_data(input);
23   in[1] = load_input_data(input + 8);
24   idct4_sse2(in);
25
26   // Columns
27   idct4_sse2(in);
28
29   // Final round and shift
30   in[0] = _mm_add_epi16(in[0], eight);
31   in[1] = _mm_add_epi16(in[1], eight);
32   in[0] = _mm_srai_epi16(in[0], 4);
33   in[1] = _mm_srai_epi16(in[1], 4);
34
35   recon_and_store4x4_sse2(in, dest, stride);
36 }
37
38 void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
39                             int stride) {
40   const __m128i zero = _mm_setzero_si128();
41   int a;
42   __m128i dc_value, d[2];
43
44   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
45   a = (int)dct_const_round_shift(a * cospi_16_64);
46   a = ROUND_POWER_OF_TWO(a, 4);
47
48   dc_value = _mm_set1_epi16(a);
49
50   // Reconstruction and Store
51   d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
52   d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
53   d[0] = _mm_unpacklo_epi32(d[0],
54                             _mm_cvtsi32_si128(*(const int *)(dest + stride)));
55   d[1] = _mm_unpacklo_epi32(
56       _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
57   d[0] = _mm_unpacklo_epi8(d[0], zero);
58   d[1] = _mm_unpacklo_epi8(d[1], zero);
59   d[0] = _mm_add_epi16(d[0], dc_value);
60   d[1] = _mm_add_epi16(d[1], dc_value);
61   d[0] = _mm_packus_epi16(d[0], d[1]);
62
63   *(int *)dest = _mm_cvtsi128_si32(d[0]);
64   d[0] = _mm_srli_si128(d[0], 4);
65   *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
66   d[0] = _mm_srli_si128(d[0], 4);
67   *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
68   d[0] = _mm_srli_si128(d[0], 4);
69   *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
70 }
71
72 void idct4_sse2(__m128i *in) {
73   const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
74   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
75   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
76   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
77   __m128i u[2];
78
79   transpose_16bit_4x4(in);
80   // stage 1
81   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
82   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
83   u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
84   u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
85
86   // stage 2
87   in[0] = _mm_add_epi16(u[0], u[1]);
88   in[1] = _mm_sub_epi16(u[0], u[1]);
89   in[1] = _mm_shuffle_epi32(in[1], 0x4E);
90 }
91
// 4-point inverse ADST on a 4x4 block held in in[0] / in[1] (eight 16-bit
// values per register). The block is transposed first so the 1-D transform
// runs along the other dimension; results are written back to in[].
void iadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_16bit_4x4(in);
  // Combine rows to form the input that feeds the x2 term (multiplied by
  // sinpi_3_9 in v[2] below).
  in7 = _mm_srli_si128(in[1], 8);
  in7 = _mm_add_epi16(in7, in[0]);
  in7 = _mm_sub_epi16(in7, in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpackhi_epi16(in[0], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  // Round and shift the 32-bit intermediates back to transform precision.
  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  // Saturating pack back to 16-bit lanes.
  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}
140
// Multiply the interleaved 16-bit halves (lo_0, hi_0) by each of the two
// cosine-pair constants and wrap the 32-bit products back to 16 bits,
// producing one butterfly output pair (res0, res1).
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
  {                                                                  \
    res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0);                 \
    res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1);                 \
  }
146
// 4-stage 1-D 8-point inverse DCT on eight __m128i rows (in0..in7 ->
// out0..out7). Expects the cosine constants stg1_0..stg1_3 / stg2_0..stg2_3
// and the temporaries stp1_* / stp2_* to be declared (by these exact names)
// in the enclosing scope, as well as the MULTIPLICATION_AND_ADD helper.
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
              out4, out5, out6, out7)                                         \
  {                                                                           \
    /* Stage1 */                                                              \
    {                                                                         \
      const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);                     \
      const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);                     \
      const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);                     \
      const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1,      \
                             stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6)  \
    }                                                                         \
                                                                              \
    /* Stage2 */                                                              \
    {                                                                         \
      const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4);                     \
      const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4);                     \
      const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);                     \
      const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);                     \
                                                                              \
      MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \
                             stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \
                                                                              \
      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \
      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \
      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \
      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \
    }                                                                         \
                                                                              \
    /* Stage3 */                                                              \
    {                                                                         \
      const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \
      const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \
                                                                              \
      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \
      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \
      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \
      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \
                                                                              \
      stp1_5 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_1);                  \
      stp1_6 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_0);                  \
    }                                                                         \
                                                                              \
    /* Stage4  */                                                             \
    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \
    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \
    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \
    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \
    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \
    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \
    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \
    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \
  }
201
// Full 2-D inverse 8x8 DCT: input holds 64 coefficients in row-major
// order; the result is reconstructed into dest (stride bytes per row)
// via RECON_AND_STORE.
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  // NOTE(review): `zero` appears to be referenced inside RECON_AND_STORE —
  // confirm against inv_txfm_sse2.h.
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // bias for >> 5
  // Cosine constants consumed by name inside the IDCT8 macro.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  // Temporaries required by name inside IDCT8.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  int i;

  // Load input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);
  in4 = load_input_data(input + 8 * 4);
  in5 = load_input_data(input + 8 * 5);
  in6 = load_input_data(input + 8 * 6);
  in7 = load_input_data(input + 8 * 7);

  // 2-D: two transpose + 1-D idct passes cover rows then columns.
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
          in6, in7);
  }

  // Final rounding and shift (saturating add guards against overflow).
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
269
270 void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
271                             int stride) {
272   __m128i dc_value;
273   const __m128i zero = _mm_setzero_si128();
274   int a;
275
276   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
277   a = (int)dct_const_round_shift(a * cospi_16_64);
278   a = ROUND_POWER_OF_TWO(a, 5);
279
280   dc_value = _mm_set1_epi16(a);
281
282   RECON_AND_STORE(dest + 0 * stride, dc_value);
283   RECON_AND_STORE(dest + 1 * stride, dc_value);
284   RECON_AND_STORE(dest + 2 * stride, dc_value);
285   RECON_AND_STORE(dest + 3 * stride, dc_value);
286   RECON_AND_STORE(dest + 4 * stride, dc_value);
287   RECON_AND_STORE(dest + 5 * stride, dc_value);
288   RECON_AND_STORE(dest + 6 * stride, dc_value);
289   RECON_AND_STORE(dest + 7 * stride, dc_value);
290 }
291
// One pass of the 8x8 inverse DCT: transpose the block held in in[0..7],
// then run the 4-stage 1-D idct8 on the transposed data, writing the
// result back to in[] in place.
void idct8_sse2(__m128i *in) {
  // Cosine constants required (by these exact names) by the IDCT8 macro.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  // Temporaries required by name inside IDCT8.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;

  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
                in1, in2, in3, in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
        in[4], in[5], in[6], in[7]);
}
314
// 8-point inverse ADST on an 8x8 block held in in[0..7] (16-bit lanes).
// The block is transposed first; three butterfly stages follow, and the
// outputs are written back to in[] with the sign flips required by the
// ADST output permutation.
void iadst8_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input (input permutation of the ADST)
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition (stage-1 butterfly: cross-sums of the multiplied terms)
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2: plain butterflies on the first half, rotations on the second
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit intergers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3: final +/-cospi_16_64 rotations with wraparound
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
  s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
  s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
  s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);

  // Output permutation; odd outputs are negated (0 - x).
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
515
// Inverse 8x8 DCT specialized for sparse inputs: only the first four rows
// of coefficients are loaded (per the "_12" naming convention this handles
// eob <= 12 — the nonzero coefficients live in the top-left 4x4). The row
// pass therefore works on half-width data before the full column pass.
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);  // bias for >> 5
  // Cosine constants consumed by name inside the IDCT8 macro.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  // Temporaries required by name inside IDCT8.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3;

  // Rows. Load 4-row input data.
  in0 = load_input_data(input);
  in1 = load_input_data(input + 8 * 1);
  in2 = load_input_data(input + 8 * 2);
  in3 = load_input_data(input + 8 * 3);

  // 8x4 Transpose
  TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
  // Stage1: odd inputs only occupy the high halves after the transpose.
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);

    stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
    stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);

    stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
    stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);

    tmp0 = _mm_add_epi16(stp1_4, stp1_5);
    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

    // Split the difference register into its two 64-bit halves.
    stp2_4 = tmp0;
    stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

    tmp0 = _mm_add_epi16(stp2_0, stp2_2);
    tmp1 = _mm_sub_epi16(stp2_0, stp2_2);
    stp1_2 = _mm_unpackhi_epi64(tmp1, tmp0);
    stp1_3 = _mm_unpacklo_epi64(tmp1, tmp0);
    stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56);  // stg3_1 = stg2_0
  }

  // Stage4
  tmp0 = _mm_add_epi16(stp1_3, stp2_4);
  tmp1 = _mm_add_epi16(stp1_2, stp1_5);
  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

  TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

  // Column pass: rows 4..7 are all zero, so pass `zero` for them.
  IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
        in5, in6, in7);
  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest + 0 * stride, in0);
  RECON_AND_STORE(dest + 1 * stride, in1);
  RECON_AND_STORE(dest + 2 * stride, in2);
  RECON_AND_STORE(dest + 3 * stride, in3);
  RECON_AND_STORE(dest + 4 * stride, in4);
  RECON_AND_STORE(dest + 5 * stride, in5);
  RECON_AND_STORE(dest + 6 * stride, in6);
  RECON_AND_STORE(dest + 7 * stride, in7);
}
617
618 #define IDCT16                                                                 \
619   /* Stage2 */                                                                 \
620   {                                                                            \
621     const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]);                 \
622     const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]);                 \
623     const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]);                   \
624     const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]);                   \
625     const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]);                 \
626     const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]);                 \
627     const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]);                 \
628     const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]);                 \
629                                                                                \
630     MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1,   \
631                            stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)   \
632                                                                                \
633     MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
634                            stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
635   }                                                                            \
636                                                                                \
637   /* Stage3 */                                                                 \
638   {                                                                            \
639     const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]);                 \
640     const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]);                 \
641     const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]);                 \
642     const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]);                 \
643                                                                                \
644     MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
645                            stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)     \
646                                                                                \
647     stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);                                  \
648     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
649     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
650     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
651                                                                                \
652     stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);                               \
653     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
654     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
655     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
656   }                                                                            \
657                                                                                \
658   /* Stage4 */                                                                 \
659   {                                                                            \
660     const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);                   \
661     const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]);                   \
662     const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]);                 \
663     const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]);                 \
664                                                                                \
665     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
666     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
667     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
668     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
669                                                                                \
670     MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1,   \
671                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
672                                                                                \
673     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
674     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
675     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
676     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
677                                                                                \
678     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
679                            stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
680                            stp2_13)                                            \
681   }                                                                            \
682                                                                                \
683   /* Stage5 */                                                                 \
684   {                                                                            \
685     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
686     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
687                                                                                \
688     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
689     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
690     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
691     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
692                                                                                \
693     stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
694     stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
695                                                                                \
696     stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
697     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
698     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
699     stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
700                                                                                \
701     stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
702     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
703     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
704     stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
705   }                                                                            \
706                                                                                \
707   /* Stage6 */                                                                 \
708   {                                                                            \
709     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
710     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
711     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
712     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
713                                                                                \
714     stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
715     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
716     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
717     stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
718     stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
719     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
720     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
721     stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
722                                                                                \
723     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
724                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
725                            stp2_12)                                            \
726   }
727
// Reduced 16-point 1-D IDCT butterfly network for sparse inputs: only
// in[0]..in[3] are read; every other input lane is substituted with the
// caller-provided `zero` register (the "_10" presumably refers to an
// eob <= 10 / top-left-coefficients-only fast path -- TODO confirm against
// the callers).  Expands in place at the call site and reads/writes the
// caller's locals by name: in[], zero, the stg* cosine-pair constants, and
// the stp1_* / stp2_* stage registers.  Statement order within each stage
// block is significant; the stage blocks mirror the full IDCT16 macro with
// the multiplies against known-zero lanes collapsed into plain copies.
#define IDCT16_10                                                              \
  /* Stage2 */                                                                 \
  {                                                                            \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero);                   \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero);                   \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
                           stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11,         \
                           stp1_12_0)                                          \
  }                                                                            \
                                                                               \
  /* Stage3 */                                                                 \
  {                                                                            \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero);                   \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero);                   \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
                                                                               \
    /* With the odd inputs zero, the stage-3 add/sub pairs degenerate to   */  \
    /* straight copies of the stage-2 outputs.                             */  \
    stp1_9 = stp1_8_0;                                                         \
    stp1_10 = stp1_11;                                                         \
                                                                               \
    stp1_13 = stp1_12_0;                                                       \
    stp1_14 = stp1_15;                                                         \
  }                                                                            \
                                                                               \
  /* Stage4 */                                                                 \
  {                                                                            \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);                    \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero);                    \
                                                                               \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
                                                                               \
    MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1)   \
    stp2_5 = stp2_4;                                                           \
    stp2_6 = stp2_7;                                                           \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
                           stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10,   \
                           stp2_13)                                            \
  }                                                                            \
                                                                               \
  /* Stage5 */                                                                 \
  {                                                                            \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
                                                                               \
    stp1_2 = stp1_1;                                                           \
    stp1_3 = stp1_0;                                                           \
                                                                               \
    stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
    stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
                                                                               \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);                                 \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);                                \
                                                                               \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);                               \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);                               \
  }                                                                            \
                                                                               \
  /* Stage6 */                                                                 \
  {                                                                            \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
                                                                               \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);                                    \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);                                    \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);                                    \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);                                    \
                                                                               \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
                           stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
                           stp2_12)                                            \
  }
816
// Full 2-D 16x16 inverse DCT, adding the reconstructed residual to `dest`.
// Two passes of the 1-D IDCT16 macro (rows, then columns), each pass
// processing the 16x16 block as two 16x8 halves.  The IDCT16 macro expands
// here and reads/writes the stg*, stp1_* and stp2_* locals by name, so the
// declarations below are part of the macro's contract.
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  // Final-stage rounding: add 32 then arithmetic shift right by 6.
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // Cosine constant pairs for each butterfly stage, packed as 16-bit
  // (even, odd) multiplicands for _mm_madd_epi16.
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // l/r hold the row-pass results for the left/right 8-column halves;
  // curr1 points at whichever half the current iteration writes.
  __m128i in[16], l[16], r[16], *curr1;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  int i;

  curr1 = l;
  // Row pass: two iterations, one per 16x8 half of the input.
  for (i = 0; i < 2; i++) {
    // 1-D idct

    // Load input data.  Consecutive coefficient rows are interleaved into
    // in[0..7] and in[8..15] so each half can be transposed as an 8x8.
    in[0] = load_input_data(input);
    in[8] = load_input_data(input + 8 * 1);
    in[1] = load_input_data(input + 8 * 2);
    in[9] = load_input_data(input + 8 * 3);
    in[2] = load_input_data(input + 8 * 4);
    in[10] = load_input_data(input + 8 * 5);
    in[3] = load_input_data(input + 8 * 6);
    in[11] = load_input_data(input + 8 * 7);
    in[4] = load_input_data(input + 8 * 8);
    in[12] = load_input_data(input + 8 * 9);
    in[5] = load_input_data(input + 8 * 10);
    in[13] = load_input_data(input + 8 * 11);
    in[6] = load_input_data(input + 8 * 12);
    in[14] = load_input_data(input + 8 * 13);
    in[7] = load_input_data(input + 8 * 14);
    in[15] = load_input_data(input + 8 * 15);

    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);

    IDCT16

    // Stage7: final butterfly combining the low outputs (stp2_0..7) with
    // the high outputs (stp1_8..15 / stp2_10..13).
    curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
    curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
    curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
    curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
    curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
    curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
    curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
    curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
    curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
    curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
    curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
    curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
    curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
    curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
    curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
    curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);

    curr1 = r;
    input += 128;  // advance to the second 16x8 half (128 coefficients)
  }
  // Column pass: two iterations, one per 8-column half of the output.
  for (i = 0; i < 2; i++) {
    int j;
    // 1-D idct
    array_transpose_8x8(l + i * 8, in);
    array_transpose_8x8(r + i * 8, in + 8);

    IDCT16

    // 2-D
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // move to the right 8-column half of the destination
  }
}
939
940 void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
941                               int stride) {
942   __m128i dc_value;
943   const __m128i zero = _mm_setzero_si128();
944   int a, i;
945
946   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
947   a = (int)dct_const_round_shift(a * cospi_16_64);
948   a = ROUND_POWER_OF_TWO(a, 6);
949
950   dc_value = _mm_set1_epi16(a);
951
952   for (i = 0; i < 16; ++i) {
953     RECON_AND_STORE(dest + 0, dc_value);
954     RECON_AND_STORE(dest + 8, dc_value);
955     dest += stride;
956   }
957 }
958
959 static void iadst16_8col(__m128i *in) {
960   // perform 16x16 1-D ADST for 8 columns
961   __m128i s[16], x[16], u[32], v[32];
962   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
963   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
964   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
965   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
966   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
967   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
968   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
969   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
970   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
971   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
972   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
973   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
974   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
975   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
976   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
977   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
978   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
979   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
980   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
981   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
982   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
983   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
984   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
985   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
986   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
987   const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
988   const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
989   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
990   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
991   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
992   const __m128i kZero = _mm_set1_epi16(0);
993
994   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
995   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
996   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
997   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
998   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
999   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1000   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1001   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1002   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1003   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1004   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1005   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1006   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1007   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1008   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1009   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1010
1011   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1012   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1013   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1014   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1015   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1016   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1017   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1018   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1019   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1020   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1021   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1022   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1023   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1024   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1025   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1026   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1027   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1028   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1029   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1030   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1031   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1032   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1033   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1034   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1035   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1036   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1037   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1038   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1039   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1040   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1041   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1042   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1043
1044   u[0] = _mm_add_epi32(v[0], v[16]);
1045   u[1] = _mm_add_epi32(v[1], v[17]);
1046   u[2] = _mm_add_epi32(v[2], v[18]);
1047   u[3] = _mm_add_epi32(v[3], v[19]);
1048   u[4] = _mm_add_epi32(v[4], v[20]);
1049   u[5] = _mm_add_epi32(v[5], v[21]);
1050   u[6] = _mm_add_epi32(v[6], v[22]);
1051   u[7] = _mm_add_epi32(v[7], v[23]);
1052   u[8] = _mm_add_epi32(v[8], v[24]);
1053   u[9] = _mm_add_epi32(v[9], v[25]);
1054   u[10] = _mm_add_epi32(v[10], v[26]);
1055   u[11] = _mm_add_epi32(v[11], v[27]);
1056   u[12] = _mm_add_epi32(v[12], v[28]);
1057   u[13] = _mm_add_epi32(v[13], v[29]);
1058   u[14] = _mm_add_epi32(v[14], v[30]);
1059   u[15] = _mm_add_epi32(v[15], v[31]);
1060   u[16] = _mm_sub_epi32(v[0], v[16]);
1061   u[17] = _mm_sub_epi32(v[1], v[17]);
1062   u[18] = _mm_sub_epi32(v[2], v[18]);
1063   u[19] = _mm_sub_epi32(v[3], v[19]);
1064   u[20] = _mm_sub_epi32(v[4], v[20]);
1065   u[21] = _mm_sub_epi32(v[5], v[21]);
1066   u[22] = _mm_sub_epi32(v[6], v[22]);
1067   u[23] = _mm_sub_epi32(v[7], v[23]);
1068   u[24] = _mm_sub_epi32(v[8], v[24]);
1069   u[25] = _mm_sub_epi32(v[9], v[25]);
1070   u[26] = _mm_sub_epi32(v[10], v[26]);
1071   u[27] = _mm_sub_epi32(v[11], v[27]);
1072   u[28] = _mm_sub_epi32(v[12], v[28]);
1073   u[29] = _mm_sub_epi32(v[13], v[29]);
1074   u[30] = _mm_sub_epi32(v[14], v[30]);
1075   u[31] = _mm_sub_epi32(v[15], v[31]);
1076
1077   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1078   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1079   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1080   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1081   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1082   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1083   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1084   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1085   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1086   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1087   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1088   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1089   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1090   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1091   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1092   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1093   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1094   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1095   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1096   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1097   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1098   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1099   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1100   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1101   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1102   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1103   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1104   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1105   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1106   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1107   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1108   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1109
1110   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1111   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1112   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1113   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1114   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1115   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1116   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1117   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1118   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1119   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1120   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1121   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1122   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1123   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1124   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1125   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1126   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1127   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1128   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1129   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1130   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1131   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1132   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1133   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1134   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1135   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1136   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1137   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1138   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1139   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1140   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1141   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1142
1143   s[0] = _mm_packs_epi32(u[0], u[1]);
1144   s[1] = _mm_packs_epi32(u[2], u[3]);
1145   s[2] = _mm_packs_epi32(u[4], u[5]);
1146   s[3] = _mm_packs_epi32(u[6], u[7]);
1147   s[4] = _mm_packs_epi32(u[8], u[9]);
1148   s[5] = _mm_packs_epi32(u[10], u[11]);
1149   s[6] = _mm_packs_epi32(u[12], u[13]);
1150   s[7] = _mm_packs_epi32(u[14], u[15]);
1151   s[8] = _mm_packs_epi32(u[16], u[17]);
1152   s[9] = _mm_packs_epi32(u[18], u[19]);
1153   s[10] = _mm_packs_epi32(u[20], u[21]);
1154   s[11] = _mm_packs_epi32(u[22], u[23]);
1155   s[12] = _mm_packs_epi32(u[24], u[25]);
1156   s[13] = _mm_packs_epi32(u[26], u[27]);
1157   s[14] = _mm_packs_epi32(u[28], u[29]);
1158   s[15] = _mm_packs_epi32(u[30], u[31]);
1159
1160   // stage 2
1161   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1162   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1163   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1164   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1165   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1166   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1167   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1168   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1169
1170   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1171   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1172   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1173   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1174   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1175   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1176   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1177   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1178   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1179   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1180   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1181   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1182   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1183   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1184   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1185   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1186
1187   u[0] = _mm_add_epi32(v[0], v[8]);
1188   u[1] = _mm_add_epi32(v[1], v[9]);
1189   u[2] = _mm_add_epi32(v[2], v[10]);
1190   u[3] = _mm_add_epi32(v[3], v[11]);
1191   u[4] = _mm_add_epi32(v[4], v[12]);
1192   u[5] = _mm_add_epi32(v[5], v[13]);
1193   u[6] = _mm_add_epi32(v[6], v[14]);
1194   u[7] = _mm_add_epi32(v[7], v[15]);
1195   u[8] = _mm_sub_epi32(v[0], v[8]);
1196   u[9] = _mm_sub_epi32(v[1], v[9]);
1197   u[10] = _mm_sub_epi32(v[2], v[10]);
1198   u[11] = _mm_sub_epi32(v[3], v[11]);
1199   u[12] = _mm_sub_epi32(v[4], v[12]);
1200   u[13] = _mm_sub_epi32(v[5], v[13]);
1201   u[14] = _mm_sub_epi32(v[6], v[14]);
1202   u[15] = _mm_sub_epi32(v[7], v[15]);
1203
1204   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1205   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1206   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1207   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1208   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1209   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1210   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1211   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1212   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1213   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1214   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1215   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1216   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1217   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1218   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1219   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1220
1221   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1222   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1223   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1224   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1225   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1226   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1227   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1228   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1229   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1230   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1231   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1232   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1233   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1234   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1235   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1236   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1237
1238   x[0] = _mm_add_epi16(s[0], s[4]);
1239   x[1] = _mm_add_epi16(s[1], s[5]);
1240   x[2] = _mm_add_epi16(s[2], s[6]);
1241   x[3] = _mm_add_epi16(s[3], s[7]);
1242   x[4] = _mm_sub_epi16(s[0], s[4]);
1243   x[5] = _mm_sub_epi16(s[1], s[5]);
1244   x[6] = _mm_sub_epi16(s[2], s[6]);
1245   x[7] = _mm_sub_epi16(s[3], s[7]);
1246   x[8] = _mm_packs_epi32(u[0], u[1]);
1247   x[9] = _mm_packs_epi32(u[2], u[3]);
1248   x[10] = _mm_packs_epi32(u[4], u[5]);
1249   x[11] = _mm_packs_epi32(u[6], u[7]);
1250   x[12] = _mm_packs_epi32(u[8], u[9]);
1251   x[13] = _mm_packs_epi32(u[10], u[11]);
1252   x[14] = _mm_packs_epi32(u[12], u[13]);
1253   x[15] = _mm_packs_epi32(u[14], u[15]);
1254
1255   // stage 3
1256   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1257   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1258   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1259   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1260   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1261   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1262   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1263   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1264
1265   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1266   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1267   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1268   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1269   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1270   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1271   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1272   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1273   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1274   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1275   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1276   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1277   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1278   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1279   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1280   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1281
1282   u[0] = _mm_add_epi32(v[0], v[4]);
1283   u[1] = _mm_add_epi32(v[1], v[5]);
1284   u[2] = _mm_add_epi32(v[2], v[6]);
1285   u[3] = _mm_add_epi32(v[3], v[7]);
1286   u[4] = _mm_sub_epi32(v[0], v[4]);
1287   u[5] = _mm_sub_epi32(v[1], v[5]);
1288   u[6] = _mm_sub_epi32(v[2], v[6]);
1289   u[7] = _mm_sub_epi32(v[3], v[7]);
1290   u[8] = _mm_add_epi32(v[8], v[12]);
1291   u[9] = _mm_add_epi32(v[9], v[13]);
1292   u[10] = _mm_add_epi32(v[10], v[14]);
1293   u[11] = _mm_add_epi32(v[11], v[15]);
1294   u[12] = _mm_sub_epi32(v[8], v[12]);
1295   u[13] = _mm_sub_epi32(v[9], v[13]);
1296   u[14] = _mm_sub_epi32(v[10], v[14]);
1297   u[15] = _mm_sub_epi32(v[11], v[15]);
1298
1299   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1300   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1301   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1302   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1303   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1304   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1305   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1306   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1307   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1308   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1309   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1310   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1311   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1312   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1313   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1314   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1315
1316   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1317   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1318   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1319   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1320   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1321   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1322   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1323   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1324   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1325   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1326   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1327   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1328   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1329   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1330   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1331   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1332
1333   s[0] = _mm_add_epi16(x[0], x[2]);
1334   s[1] = _mm_add_epi16(x[1], x[3]);
1335   s[2] = _mm_sub_epi16(x[0], x[2]);
1336   s[3] = _mm_sub_epi16(x[1], x[3]);
1337   s[4] = _mm_packs_epi32(v[0], v[1]);
1338   s[5] = _mm_packs_epi32(v[2], v[3]);
1339   s[6] = _mm_packs_epi32(v[4], v[5]);
1340   s[7] = _mm_packs_epi32(v[6], v[7]);
1341   s[8] = _mm_add_epi16(x[8], x[10]);
1342   s[9] = _mm_add_epi16(x[9], x[11]);
1343   s[10] = _mm_sub_epi16(x[8], x[10]);
1344   s[11] = _mm_sub_epi16(x[9], x[11]);
1345   s[12] = _mm_packs_epi32(v[8], v[9]);
1346   s[13] = _mm_packs_epi32(v[10], v[11]);
1347   s[14] = _mm_packs_epi32(v[12], v[13]);
1348   s[15] = _mm_packs_epi32(v[14], v[15]);
1349
1350   // stage 4
1351   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1352   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1353   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1354   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1355   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1356   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1357   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1358   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1359
1360   in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
1361   in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
1362   in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
1363   in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
1364   in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
1365   in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
1366   in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
1367   in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
1368
1369   in[0] = s[0];
1370   in[1] = _mm_sub_epi16(kZero, s[8]);
1371   in[2] = s[12];
1372   in[3] = _mm_sub_epi16(kZero, s[4]);
1373   in[12] = s[5];
1374   in[13] = _mm_sub_epi16(kZero, s[13]);
1375   in[14] = s[9];
1376   in[15] = _mm_sub_epi16(kZero, s[1]);
1377 }
1378
// In-place 16-point inverse DCT applied to 8 columns at once: each of
// in[0..15] holds eight 16-bit coefficients (one row of an 8x16 half-block).
// Implements the standard 7-stage idct16 butterfly network.
// idct_calc_wraplow_sse2() is the shared helper (see inv_txfm_sse2.h) that
// presumably performs the 32-bit _mm_madd_epi16 products, rounds/shifts by
// DCT_CONST_BITS and packs back to 16 bits, matching the expanded sequence
// used elsewhere in this file.
static void idct16_8col(__m128i *in) {
  // Cosine-constant pairs: each vector interleaves (a, b) 16-bit lanes so
  // that _mm_madd_epi16 against an unpacked coefficient pair (x, y) yields
  // a*x + b*y in each 32-bit lane.
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i u[16], s[16], t[16];  // u: unpacked pairs, s/t: alternating stages

  // stage 1: permute the input rows into butterfly order — even rows
  // (0,8,4,12,2,10,6,14) feed the embedded idct8; odd rows feed the
  // odd-coefficient half s[8..15].
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2: rotate the odd-half pairs (8,15) (9,14) (10,13) (11,12) by the
  // cospi_2/30, 18/14, 10/22, 26/6 angles.
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
  s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
  s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
  s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
  s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
  s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
  s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
  s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);

  // stage 3: rotate (4,7) and (5,6) by cospi_4/28 and 20/12, and run the
  // first add/sub butterflies on the odd half t[8..15].
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
  t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
  t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
  t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4: idct4 rotations for (0,1) and (2,3), plus the +/-cospi_8/24
  // rotations on (9,14) and (10,13); (4..7) butterfly; 8,11,12,15 pass
  // through.
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
  s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
  s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
  s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
  s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
  s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[11] = t[11];
  s[12] = t[12];

  // stage 5: even-half butterflies; (5,6) rotated by +/-cospi_16; odd-half
  // add/sub butterflies across the 8..15 lanes.
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  t[5] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
  t[6] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6: complete the even half s[0..7]; rotate (10,13) and (11,12) by
  // +/-cospi_16 for the final odd-half mixing.
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
  s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
  s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7: final butterflies, in[k] = s[k] +/- s[15-k], written back to
  // the caller's buffer.
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}
1553
// Full 16x16 inverse DCT: transpose the two 8x16 coefficient halves, then
// run the 16-point 1-D transform on each half.  The halves are independent,
// so they may be processed in either order.
void idct16_sse2(__m128i *in0, __m128i *in1) {
  __m128i *const half[2] = { in0, in1 };
  int k;

  array_transpose_16x16(in0, in1);
  for (k = 0; k < 2; ++k) idct16_8col(half[k]);
}
1559
// Full 16x16 inverse ADST: transpose the two 8x16 coefficient halves, then
// run the 16-point 1-D iadst on each half.  The halves are independent, so
// they may be processed in either order.
void iadst16_sse2(__m128i *in0, __m128i *in1) {
  __m128i *const half[2] = { in0, in1 };
  int k;

  array_transpose_16x16(in0, in1);
  for (k = 0; k < 2; ++k) iadst16_8col(half[k]);
}
1565
// 16x16 inverse DCT + reconstruction for sparse blocks.  Per libvpx naming
// the "_10" variant assumes at most the first 10 zig-zag coefficients are
// non-zero — consistent with this code only loading the first 8 entries of
// rows 0..3 and zero-filling everything else.  The result is added to the
// predictor in `dest` (row pitch `stride`) with clamping.
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  // Final rounding adds 2^5 before the arithmetic shift by 6 below.
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  // Stage constants: interleaved (a, b) cosine pairs for _mm_madd_epi16.
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  __m128i in[16], l[16];
  // Some stp1_*/stp2_* names below (e.g. stp1_8_0, stp1_12_0, stp2_14) are
  // only referenced inside the IDCT16_10 macro used in the second pass.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
      stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
      stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
  __m128i tmp0, tmp1, tmp2, tmp3;
  int i;
  // First 1-D inverse DCT
  // Load input data: only the first 8 coefficients of rows 0..3 (row pitch
  // is 16 in `input`); all other coefficients are treated as zero.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 8 * 2);
  in[2] = load_input_data(input + 8 * 4);
  in[3] = load_input_data(input + 8 * 6);

  TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);

  // Stage2: odd-coefficient rotations.  Unpacking against `zero` supplies
  // the implicit zero coefficient of each (x, 0) / (0, x) pair.
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);

    stp2_8 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_1_15);
    stp2_11 = idct_calc_wraplow_sse2(stg2_6, stg2_7, lo_13_3);
  }

  // Stage3: rotate row 2; the high 64-bit halves of stp2_8/stp2_11 double
  // as stp1_14/stp1_13 (two results are packed per register in this sparse
  // path).
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);

    stp1_4 = idct_calc_wraplow_sse2(stg3_0, stg3_1, lo_2_14);
    stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
    stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
  }

  // Stage4: DC rotation of row 0 plus the +/-cospi_8/24 rotations on
  // (9,14) and (10,13).
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);

    tmp0 = idct_madd_round_shift_sse2(lo_0_8, stg4_0);
    tmp1 = idct_madd_round_shift_sse2(lo_0_8, stg4_1);
    stp1_0 = _mm_packs_epi32(tmp0, tmp0);
    stp1_1 = _mm_packs_epi32(tmp1, tmp1);
    stp2_9 = idct_calc_wraplow_sse2(stg4_4, stg4_5, lo_9_14);
    stp2_10 = idct_calc_wraplow_sse2(stg4_6, stg4_7, lo_10_13);

    stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
  }

  // Stage5 and Stage6: odd-half butterflies; results are split back into
  // separate low/high 64-bit registers for the stage-7 combination.
  {
    tmp0 = _mm_add_epi16(stp2_8, stp2_11);
    tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
    tmp2 = _mm_add_epi16(stp2_9, stp2_10);
    tmp3 = _mm_sub_epi16(stp2_9, stp2_10);

    stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
    stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
    stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
    stp1_11 = _mm_unpacklo_epi64(tmp1, zero);

    stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
    stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
    stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
    stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
  }

  // Stage6: +/-cospi_16 rotations on (6,5), (10,13), (11,12) and the even
  // half butterflies.
  {
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

    stp1_6 = idct_calc_wraplow_sse2(stg4_0, stg4_1, lo_6_5);
    tmp0 = idct_madd_round_shift_sse2(lo_10_13, stg6_0);
    tmp1 = idct_madd_round_shift_sse2(lo_10_13, stg4_0);
    tmp2 = idct_madd_round_shift_sse2(lo_11_12, stg6_0);
    tmp3 = idct_madd_round_shift_sse2(lo_11_12, stg4_0);

    stp2_10 = _mm_packs_epi32(tmp0, zero);
    stp2_13 = _mm_packs_epi32(tmp1, zero);
    stp2_11 = _mm_packs_epi32(tmp2, zero);
    stp2_12 = _mm_packs_epi32(tmp3, zero);

    tmp0 = _mm_add_epi16(stp1_0, stp1_4);
    tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
    tmp2 = _mm_add_epi16(stp1_1, stp1_6);
    tmp3 = _mm_sub_epi16(stp1_1, stp1_6);

    // Unpack the packed register pairs into the eight even-half outputs.
    stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
    stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
    stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
    stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
    stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
    stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
    stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
    stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
  }

  // Stage7. Left 8x16 only: l[k] = even[k] +/- odd[15-k].
  l[0] = _mm_add_epi16(stp2_0, stp1_15);
  l[1] = _mm_add_epi16(stp2_1, stp1_14);
  l[2] = _mm_add_epi16(stp2_2, stp2_13);
  l[3] = _mm_add_epi16(stp2_3, stp2_12);
  l[4] = _mm_add_epi16(stp2_4, stp2_11);
  l[5] = _mm_add_epi16(stp2_5, stp2_10);
  l[6] = _mm_add_epi16(stp2_6, stp1_9);
  l[7] = _mm_add_epi16(stp2_7, stp1_8);
  l[8] = _mm_sub_epi16(stp2_7, stp1_8);
  l[9] = _mm_sub_epi16(stp2_6, stp1_9);
  l[10] = _mm_sub_epi16(stp2_5, stp2_10);
  l[11] = _mm_sub_epi16(stp2_4, stp2_11);
  l[12] = _mm_sub_epi16(stp2_3, stp2_12);
  l[13] = _mm_sub_epi16(stp2_2, stp2_13);
  l[14] = _mm_sub_epi16(stp2_1, stp1_14);
  l[15] = _mm_sub_epi16(stp2_0, stp1_15);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 2; i++) {
    int j;
    array_transpose_4X8(l + 8 * i, in);

    // IDCT16_10 (macro defined elsewhere in this file) computes the column
    // transform into the stp1_*/stp2_* variables used below.
    IDCT16_10

    // Stage7
    in[0] = _mm_add_epi16(stp2_0, stp1_15);
    in[1] = _mm_add_epi16(stp2_1, stp1_14);
    in[2] = _mm_add_epi16(stp2_2, stp2_13);
    in[3] = _mm_add_epi16(stp2_3, stp2_12);
    in[4] = _mm_add_epi16(stp2_4, stp2_11);
    in[5] = _mm_add_epi16(stp2_5, stp2_10);
    in[6] = _mm_add_epi16(stp2_6, stp1_9);
    in[7] = _mm_add_epi16(stp2_7, stp1_8);
    in[8] = _mm_sub_epi16(stp2_7, stp1_8);
    in[9] = _mm_sub_epi16(stp2_6, stp1_9);
    in[10] = _mm_sub_epi16(stp2_5, stp2_10);
    in[11] = _mm_sub_epi16(stp2_4, stp2_11);
    in[12] = _mm_sub_epi16(stp2_3, stp2_12);
    in[13] = _mm_sub_epi16(stp2_2, stp2_13);
    in[14] = _mm_sub_epi16(stp2_1, stp1_14);
    in[15] = _mm_sub_epi16(stp2_0, stp1_15);

    for (j = 0; j < 16; ++j) {
      // Final rounding and shift
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    // Advance to the right 8-pixel half of the destination.
    dest += 8;
  }
}
1741
// Load 8 dequantized 16-bit coefficients into `reg` and advance `input`
// past them.  Wrapped in do { } while (0) so the expansion is a single
// statement: the previous bare-brace form, followed by the caller's `;`,
// would mis-parse inside an unbraced if/else (the `;` terminates the `if`
// and orphans the `else`).  Call sites of the form `LOAD_DQCOEFF(x, p);`
// are unaffected.
#define LOAD_DQCOEFF(reg, input)  \
  do {                            \
    reg = load_input_data(input); \
    input += 8;                   \
  } while (0)
1747
1748 #define IDCT32_34                                                              \
1749   /* Stage1 */                                                                 \
1750   {                                                                            \
1751     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero);                   \
1752     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero);                   \
1753                                                                                \
1754     const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]);                   \
1755     const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]);                   \
1756                                                                                \
1757     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero);                   \
1758     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero);                   \
1759                                                                                \
1760     const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]);                   \
1761     const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]);                   \
1762                                                                                \
1763     MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16,        \
1764                              stp1_31);                                         \
1765     MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19,        \
1766                              stp1_28);                                         \
1767     MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20,        \
1768                              stp1_27);                                         \
1769     MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23,      \
1770                              stp1_24);                                         \
1771   }                                                                            \
1772                                                                                \
1773   /* Stage2 */                                                                 \
1774   {                                                                            \
1775     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero);                   \
1776     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero);                   \
1777                                                                                \
1778     const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]);                   \
1779     const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]);                   \
1780                                                                                \
1781     MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8,         \
1782                              stp2_15);                                         \
1783     MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11,        \
1784                              stp2_12);                                         \
1785                                                                                \
1786     stp2_16 = stp1_16;                                                         \
1787     stp2_19 = stp1_19;                                                         \
1788                                                                                \
1789     stp2_20 = stp1_20;                                                         \
1790     stp2_23 = stp1_23;                                                         \
1791                                                                                \
1792     stp2_24 = stp1_24;                                                         \
1793     stp2_27 = stp1_27;                                                         \
1794                                                                                \
1795     stp2_28 = stp1_28;                                                         \
1796     stp2_31 = stp1_31;                                                         \
1797   }                                                                            \
1798                                                                                \
1799   /* Stage3 */                                                                 \
1800   {                                                                            \
1801     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero);                   \
1802     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero);                   \
1803                                                                                \
1804     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31);             \
1805     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31);             \
1806     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28);             \
1807     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28);             \
1808                                                                                \
1809     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27);             \
1810     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27);             \
1811     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24);             \
1812     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24);             \
1813                                                                                \
1814     MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4,         \
1815                              stp1_7);                                          \
1816                                                                                \
1817     stp1_8 = stp2_8;                                                           \
1818     stp1_11 = stp2_11;                                                         \
1819     stp1_12 = stp2_12;                                                         \
1820     stp1_15 = stp2_15;                                                         \
1821                                                                                \
1822     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
1823                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
1824                            stp1_29)                                            \
1825     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
1826                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
1827                            stp1_25)                                            \
1828                                                                                \
1829     stp1_16 = stp2_16;                                                         \
1830     stp1_31 = stp2_31;                                                         \
1831     stp1_19 = stp2_19;                                                         \
1832     stp1_20 = stp2_20;                                                         \
1833     stp1_23 = stp2_23;                                                         \
1834     stp1_24 = stp2_24;                                                         \
1835     stp1_27 = stp2_27;                                                         \
1836     stp1_28 = stp2_28;                                                         \
1837   }                                                                            \
1838                                                                                \
1839   /* Stage4 */                                                                 \
1840   {                                                                            \
1841     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero);                   \
1842     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero);                   \
1843                                                                                \
1844     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);               \
1845     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);               \
1846     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);             \
1847     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);             \
1848                                                                                \
1849     MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0,         \
1850                              stp2_1);                                          \
1851                                                                                \
1852     stp2_4 = stp1_4;                                                           \
1853     stp2_5 = stp1_4;                                                           \
1854     stp2_6 = stp1_7;                                                           \
1855     stp2_7 = stp1_7;                                                           \
1856                                                                                \
1857     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
1858                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
1859                            stp2_13)                                            \
1860                                                                                \
1861     stp2_8 = stp1_8;                                                           \
1862     stp2_15 = stp1_15;                                                         \
1863     stp2_11 = stp1_11;                                                         \
1864     stp2_12 = stp1_12;                                                         \
1865                                                                                \
1866     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
1867     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
1868     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
1869     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
1870     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
1871     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
1872     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
1873     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
1874                                                                                \
1875     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
1876     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
1877     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
1878     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
1879     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
1880     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
1881     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
1882     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
1883   }                                                                            \
1884                                                                                \
1885   /* Stage5 */                                                                 \
1886   {                                                                            \
1887     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
1888     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
1889     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
1890     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
1891                                                                                \
1892     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
1893     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
1894     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
1895     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
1896                                                                                \
1897     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
1898     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
1899                                                                                \
1900     stp1_0 = stp2_0;                                                           \
1901     stp1_1 = stp2_1;                                                           \
1902     stp1_2 = stp2_1;                                                           \
1903     stp1_3 = stp2_0;                                                           \
1904                                                                                \
1905     stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
1906     stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
1907                                                                                \
1908     stp1_4 = stp2_4;                                                           \
1909     stp1_7 = stp2_7;                                                           \
1910                                                                                \
1911     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
1912     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
1913     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
1914     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
1915     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
1916     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
1917     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
1918     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
1919                                                                                \
1920     stp1_16 = stp2_16;                                                         \
1921     stp1_17 = stp2_17;                                                         \
1922                                                                                \
1923     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
1924                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
1925                            stp1_28)                                            \
1926     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
1927                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
1928                            stp1_26)                                            \
1929                                                                                \
1930     stp1_22 = stp2_22;                                                         \
1931     stp1_23 = stp2_23;                                                         \
1932     stp1_24 = stp2_24;                                                         \
1933     stp1_25 = stp2_25;                                                         \
1934     stp1_30 = stp2_30;                                                         \
1935     stp1_31 = stp2_31;                                                         \
1936   }                                                                            \
1937                                                                                \
1938   /* Stage6 */                                                                 \
1939   {                                                                            \
1940     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
1941     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
1942     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
1943     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
1944                                                                                \
1945     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
1946     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
1947     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
1948     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
1949     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
1950     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
1951     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
1952     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
1953                                                                                \
1954     stp2_8 = stp1_8;                                                           \
1955     stp2_9 = stp1_9;                                                           \
1956     stp2_14 = stp1_14;                                                         \
1957     stp2_15 = stp1_15;                                                         \
1958                                                                                \
1959     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
1960                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
1961                            stp2_12)                                            \
1962                                                                                \
1963     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
1964     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
1965     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
1966     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
1967     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
1968     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
1969     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
1970     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
1971                                                                                \
1972     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
1973     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
1974     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
1975     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
1976     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
1977     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
1978     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
1979     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
1980   }                                                                            \
1981                                                                                \
1982   /* Stage7 */                                                                 \
1983   {                                                                            \
1984     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
1985     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
1986     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
1987     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
1988                                                                                \
1989     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
1990     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
1991     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
1992     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
1993                                                                                \
1994     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
1995     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
1996     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
1997     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
1998     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
1999     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
2000     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
2001     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
2002     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
2003     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
2004     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
2005     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
2006     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
2007     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
2008     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
2009     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
2010                                                                                \
2011     stp1_16 = stp2_16;                                                         \
2012     stp1_17 = stp2_17;                                                         \
2013     stp1_18 = stp2_18;                                                         \
2014     stp1_19 = stp2_19;                                                         \
2015                                                                                \
2016     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
2017                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
2018                            stp1_26)                                            \
2019     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
2020                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
2021                            stp1_24)                                            \
2022                                                                                \
2023     stp1_28 = stp2_28;                                                         \
2024     stp1_29 = stp2_29;                                                         \
2025     stp1_30 = stp2_30;                                                         \
2026     stp1_31 = stp2_31;                                                         \
2027   }
2028
2029 #define IDCT32                                                                 \
2030   /* Stage1 */                                                                 \
2031   {                                                                            \
2032     const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);                 \
2033     const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);                 \
2034     const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);               \
2035     const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);               \
2036                                                                                \
2037     const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);                 \
2038     const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);                 \
2039     const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);                 \
2040     const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);                 \
2041                                                                                \
2042     const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);                 \
2043     const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);                 \
2044     const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);               \
2045     const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);               \
2046                                                                                \
2047     const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);               \
2048     const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);               \
2049     const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);                 \
2050     const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);                 \
2051                                                                                \
2052     MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,       \
2053                            stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17,  \
2054                            stp1_30)                                            \
2055     MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
2056                            stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
2057     MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,       \
2058                            stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,         \
2059                            stp1_21, stp1_26)                                   \
2060     MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,      \
2061                            stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,        \
2062                            stp1_23, stp1_24)                                   \
2063   }                                                                            \
2064                                                                                \
2065   /* Stage2 */                                                                 \
2066   {                                                                            \
2067     const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);                 \
2068     const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);                 \
2069     const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);               \
2070     const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);               \
2071                                                                                \
2072     const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);               \
2073     const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);               \
2074     const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);                 \
2075     const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);                 \
2076                                                                                \
2077     MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,       \
2078                            stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,    \
2079                            stp2_14)                                            \
2080     MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,       \
2081                            stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11,  \
2082                            stp2_12)                                            \
2083                                                                                \
2084     stp2_16 = _mm_add_epi16(stp1_16, stp1_17);                                 \
2085     stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);                                 \
2086     stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);                                 \
2087     stp2_19 = _mm_add_epi16(stp1_19, stp1_18);                                 \
2088                                                                                \
2089     stp2_20 = _mm_add_epi16(stp1_20, stp1_21);                                 \
2090     stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);                                 \
2091     stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);                                 \
2092     stp2_23 = _mm_add_epi16(stp1_23, stp1_22);                                 \
2093                                                                                \
2094     stp2_24 = _mm_add_epi16(stp1_24, stp1_25);                                 \
2095     stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);                                 \
2096     stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);                                 \
2097     stp2_27 = _mm_add_epi16(stp1_27, stp1_26);                                 \
2098                                                                                \
2099     stp2_28 = _mm_add_epi16(stp1_28, stp1_29);                                 \
2100     stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);                                 \
2101     stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);                                 \
2102     stp2_31 = _mm_add_epi16(stp1_31, stp1_30);                                 \
2103   }                                                                            \
2104                                                                                \
2105   /* Stage3 */                                                                 \
2106   {                                                                            \
2107     const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);                 \
2108     const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);                 \
2109     const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);               \
2110     const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);               \
2111                                                                                \
2112     const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);             \
2113     const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);             \
2114     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
2115     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
2116                                                                                \
2117     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2118     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2119     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
2120     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
2121                                                                                \
2122     MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,       \
2123                            stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,     \
2124                            stp1_6)                                             \
2125                                                                                \
2126     stp1_8 = _mm_add_epi16(stp2_8, stp2_9);                                    \
2127     stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);                                    \
2128     stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);                                 \
2129     stp1_11 = _mm_add_epi16(stp2_11, stp2_10);                                 \
2130     stp1_12 = _mm_add_epi16(stp2_12, stp2_13);                                 \
2131     stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);                                 \
2132     stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);                                 \
2133     stp1_15 = _mm_add_epi16(stp2_15, stp2_14);                                 \
2134                                                                                \
2135     MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,     \
2136                            stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,  \
2137                            stp1_29)                                            \
2138     MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,     \
2139                            stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
2140                            stp1_25)                                            \
2141                                                                                \
2142     stp1_16 = stp2_16;                                                         \
2143     stp1_31 = stp2_31;                                                         \
2144     stp1_19 = stp2_19;                                                         \
2145     stp1_20 = stp2_20;                                                         \
2146     stp1_23 = stp2_23;                                                         \
2147     stp1_24 = stp2_24;                                                         \
2148     stp1_27 = stp2_27;                                                         \
2149     stp1_28 = stp2_28;                                                         \
2150   }                                                                            \
2151                                                                                \
2152   /* Stage4 */                                                                 \
2153   {                                                                            \
2154     const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);                 \
2155     const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);                 \
2156     const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);                 \
2157     const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);                 \
2158                                                                                \
2159     const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);               \
2160     const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);               \
2161     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
2162     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
2163                                                                                \
2164     MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
2165                            stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)     \
2166                                                                                \
2167     stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                    \
2168     stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                    \
2169     stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                    \
2170     stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                    \
2171                                                                                \
2172     MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,       \
2173                            stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10,   \
2174                            stp2_13)                                            \
2175                                                                                \
2176     stp2_8 = stp1_8;                                                           \
2177     stp2_15 = stp1_15;                                                         \
2178     stp2_11 = stp1_11;                                                         \
2179     stp2_12 = stp1_12;                                                         \
2180                                                                                \
2181     stp2_16 = _mm_add_epi16(stp1_16, stp1_19);                                 \
2182     stp2_17 = _mm_add_epi16(stp1_17, stp1_18);                                 \
2183     stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);                                 \
2184     stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);                                 \
2185     stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);                                 \
2186     stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);                                 \
2187     stp2_22 = _mm_add_epi16(stp1_22, stp1_21);                                 \
2188     stp2_23 = _mm_add_epi16(stp1_23, stp1_20);                                 \
2189                                                                                \
2190     stp2_24 = _mm_add_epi16(stp1_24, stp1_27);                                 \
2191     stp2_25 = _mm_add_epi16(stp1_25, stp1_26);                                 \
2192     stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);                                 \
2193     stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);                                 \
2194     stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);                                 \
2195     stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);                                 \
2196     stp2_30 = _mm_add_epi16(stp1_29, stp1_30);                                 \
2197     stp2_31 = _mm_add_epi16(stp1_28, stp1_31);                                 \
2198   }                                                                            \
2199                                                                                \
2200   /* Stage5 */                                                                 \
2201   {                                                                            \
2202     const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);                 \
2203     const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);                 \
2204     const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);             \
2205     const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);             \
2206                                                                                \
2207     const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);             \
2208     const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);             \
2209     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2210     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2211                                                                                \
2212     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2213     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2214                                                                                \
2215     stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                    \
2216     stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                    \
2217     stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                    \
2218     stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                    \
2219                                                                                \
2220     stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1);                   \
2221     stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0);                   \
2222                                                                                \
2223     stp1_4 = stp2_4;                                                           \
2224     stp1_7 = stp2_7;                                                           \
2225                                                                                \
2226     stp1_8 = _mm_add_epi16(stp2_8, stp2_11);                                   \
2227     stp1_9 = _mm_add_epi16(stp2_9, stp2_10);                                   \
2228     stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);                                  \
2229     stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);                                  \
2230     stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);                                 \
2231     stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);                                 \
2232     stp1_14 = _mm_add_epi16(stp2_14, stp2_13);                                 \
2233     stp1_15 = _mm_add_epi16(stp2_15, stp2_12);                                 \
2234                                                                                \
2235     stp1_16 = stp2_16;                                                         \
2236     stp1_17 = stp2_17;                                                         \
2237                                                                                \
2238     MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,     \
2239                            stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,  \
2240                            stp1_28)                                            \
2241     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,     \
2242                            stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,  \
2243                            stp1_26)                                            \
2244                                                                                \
2245     stp1_22 = stp2_22;                                                         \
2246     stp1_23 = stp2_23;                                                         \
2247     stp1_24 = stp2_24;                                                         \
2248     stp1_25 = stp2_25;                                                         \
2249     stp1_30 = stp2_30;                                                         \
2250     stp1_31 = stp2_31;                                                         \
2251   }                                                                            \
2252                                                                                \
2253   /* Stage6 */                                                                 \
2254   {                                                                            \
2255     const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);             \
2256     const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);             \
2257     const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);             \
2258     const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);             \
2259                                                                                \
2260     stp2_0 = _mm_add_epi16(stp1_0, stp1_7);                                    \
2261     stp2_1 = _mm_add_epi16(stp1_1, stp1_6);                                    \
2262     stp2_2 = _mm_add_epi16(stp1_2, stp1_5);                                    \
2263     stp2_3 = _mm_add_epi16(stp1_3, stp1_4);                                    \
2264     stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);                                    \
2265     stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);                                    \
2266     stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);                                    \
2267     stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);                                    \
2268                                                                                \
2269     stp2_8 = stp1_8;                                                           \
2270     stp2_9 = stp1_9;                                                           \
2271     stp2_14 = stp1_14;                                                         \
2272     stp2_15 = stp1_15;                                                         \
2273                                                                                \
2274     MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,     \
2275                            stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,  \
2276                            stp2_12)                                            \
2277                                                                                \
2278     stp2_16 = _mm_add_epi16(stp1_16, stp1_23);                                 \
2279     stp2_17 = _mm_add_epi16(stp1_17, stp1_22);                                 \
2280     stp2_18 = _mm_add_epi16(stp1_18, stp1_21);                                 \
2281     stp2_19 = _mm_add_epi16(stp1_19, stp1_20);                                 \
2282     stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);                                 \
2283     stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);                                 \
2284     stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);                                 \
2285     stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);                                 \
2286                                                                                \
2287     stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);                                 \
2288     stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);                                 \
2289     stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);                                 \
2290     stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);                                 \
2291     stp2_28 = _mm_add_epi16(stp1_27, stp1_28);                                 \
2292     stp2_29 = _mm_add_epi16(stp1_26, stp1_29);                                 \
2293     stp2_30 = _mm_add_epi16(stp1_25, stp1_30);                                 \
2294     stp2_31 = _mm_add_epi16(stp1_24, stp1_31);                                 \
2295   }                                                                            \
2296                                                                                \
2297   /* Stage7 */                                                                 \
2298   {                                                                            \
2299     const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);             \
2300     const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);             \
2301     const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);             \
2302     const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);             \
2303                                                                                \
2304     const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);             \
2305     const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);             \
2306     const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);             \
2307     const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);             \
2308                                                                                \
2309     stp1_0 = _mm_add_epi16(stp2_0, stp2_15);                                   \
2310     stp1_1 = _mm_add_epi16(stp2_1, stp2_14);                                   \
2311     stp1_2 = _mm_add_epi16(stp2_2, stp2_13);                                   \
2312     stp1_3 = _mm_add_epi16(stp2_3, stp2_12);                                   \
2313     stp1_4 = _mm_add_epi16(stp2_4, stp2_11);                                   \
2314     stp1_5 = _mm_add_epi16(stp2_5, stp2_10);                                   \
2315     stp1_6 = _mm_add_epi16(stp2_6, stp2_9);                                    \
2316     stp1_7 = _mm_add_epi16(stp2_7, stp2_8);                                    \
2317     stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);                                    \
2318     stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);                                    \
2319     stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);                                  \
2320     stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);                                  \
2321     stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);                                  \
2322     stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);                                  \
2323     stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);                                  \
2324     stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);                                  \
2325                                                                                \
2326     stp1_16 = stp2_16;                                                         \
2327     stp1_17 = stp2_17;                                                         \
2328     stp1_18 = stp2_18;                                                         \
2329     stp1_19 = stp2_19;                                                         \
2330                                                                                \
2331     MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,     \
2332                            stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,  \
2333                            stp1_26)                                            \
2334     MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,     \
2335                            stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,  \
2336                            stp1_24)                                            \
2337                                                                                \
2338     stp1_28 = stp2_28;                                                         \
2339     stp1_29 = stp2_29;                                                         \
2340     stp1_30 = stp2_30;                                                         \
2341     stp1_31 = stp2_31;                                                         \
2342   }
2343
// Only upper-left 8x8 has non-zero coeff
//
// 32x32 inverse DCT for the "34 eob" case: only the top-left 8x8 block of
// the dequantized coefficients is non-zero, so only 8 input rows are loaded
// and the reduced IDCT32_34 macro (defined above) is used for both passes.
// Adds the reconstructed residual to the 32x32 prediction at `dest`.
//
// NOTE: the IDCT32_34 and RECON_AND_STORE macros reference the locals below
// (`in`, `zero`, `stp1_*`, `stp2_*`, and every `stg*` constant) by name, so
// none of these identifiers may be renamed.
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  const __m128i zero = _mm_setzero_si128();  // consumed inside RECON_AND_STORE
  // Bias of 32 so the final arithmetic shift by 6 rounds to nearest.
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);

  // idct constants for each stage (cosine pairs packed as 16-bit lanes;
  // consumed inside the IDCT32_34 macro expansion)
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in[32], col[32];
  // stp1_*/stp2_* are the per-stage butterfly outputs written by IDCT32_34.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  in[0] = load_input_data(input);
  in[1] = load_input_data(input + 32);
  in[2] = load_input_data(input + 64);
  in[3] = load_input_data(input + 96);
  in[4] = load_input_data(input + 128);
  in[5] = load_input_data(input + 160);
  in[6] = load_input_data(input + 192);
  in[7] = load_input_data(input + 224);

  // First (row) 1-D transform on the transposed 8x8 input.
  array_transpose_8x8(in, in);
  IDCT32_34

  // 1_D: Store 32 intermediate results for each 8x32 block.
  // Final stage of the butterfly: outputs k and 31-k are the sum/difference
  // of stp1_k and stp1_(31-k).
  col[0] = _mm_add_epi16(stp1_0, stp1_31);
  col[1] = _mm_add_epi16(stp1_1, stp1_30);
  col[2] = _mm_add_epi16(stp1_2, stp1_29);
  col[3] = _mm_add_epi16(stp1_3, stp1_28);
  col[4] = _mm_add_epi16(stp1_4, stp1_27);
  col[5] = _mm_add_epi16(stp1_5, stp1_26);
  col[6] = _mm_add_epi16(stp1_6, stp1_25);
  col[7] = _mm_add_epi16(stp1_7, stp1_24);
  col[8] = _mm_add_epi16(stp1_8, stp1_23);
  col[9] = _mm_add_epi16(stp1_9, stp1_22);
  col[10] = _mm_add_epi16(stp1_10, stp1_21);
  col[11] = _mm_add_epi16(stp1_11, stp1_20);
  col[12] = _mm_add_epi16(stp1_12, stp1_19);
  col[13] = _mm_add_epi16(stp1_13, stp1_18);
  col[14] = _mm_add_epi16(stp1_14, stp1_17);
  col[15] = _mm_add_epi16(stp1_15, stp1_16);
  col[16] = _mm_sub_epi16(stp1_15, stp1_16);
  col[17] = _mm_sub_epi16(stp1_14, stp1_17);
  col[18] = _mm_sub_epi16(stp1_13, stp1_18);
  col[19] = _mm_sub_epi16(stp1_12, stp1_19);
  col[20] = _mm_sub_epi16(stp1_11, stp1_20);
  col[21] = _mm_sub_epi16(stp1_10, stp1_21);
  col[22] = _mm_sub_epi16(stp1_9, stp1_22);
  col[23] = _mm_sub_epi16(stp1_8, stp1_23);
  col[24] = _mm_sub_epi16(stp1_7, stp1_24);
  col[25] = _mm_sub_epi16(stp1_6, stp1_25);
  col[26] = _mm_sub_epi16(stp1_5, stp1_26);
  col[27] = _mm_sub_epi16(stp1_4, stp1_27);
  col[28] = _mm_sub_epi16(stp1_3, stp1_28);
  col[29] = _mm_sub_epi16(stp1_2, stp1_29);
  col[30] = _mm_sub_epi16(stp1_1, stp1_30);
  col[31] = _mm_sub_epi16(stp1_0, stp1_31);
  // Second (column) pass: process four 8-column strips of the 32x32 block.
  for (i = 0; i < 4; i++) {
    int j;
    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + i * 8, in);
    IDCT32_34

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift (saturating add of 32, then >> 6)
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // advance to the next 8-pixel-wide column strip
  }
}
2489
// Full 32x32 inverse DCT (up to 1024 non-zero coefficients), performed as
// two 1-D passes (rows, then columns) over four 8-wide strips. The residual
// is added to the 32x32 prediction at `dest`.
//
// NOTE: the IDCT32, LOAD_DQCOEFF and RECON_AND_STORE macros reference the
// locals below (`in`, `input`, `zero`, `stp1_*`, `stp2_*`, and every `stg*`
// constant) by name; none of these identifiers may be renamed.
// LOAD_DQCOEFF also appears to advance `input` as a side effect — see the
// macro definition earlier in this file.
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  // Bias of 32 so the final arithmetic shift by 6 rounds to nearest.
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();  // also used by RECON_AND_STORE

  // idct constants for each stage (cosine pairs packed as 16-bit lanes;
  // consumed inside the IDCT32 macro expansion)
  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // col[] holds all 128 intermediate row-pass vectors (4 strips x 32 rows);
  // zero_idx[] is scratch for the all-zero detection OR-reduction below.
  __m128i in[32], col[128], zero_idx[16];
  // stp1_*/stp2_* are the per-stage butterfly outputs written by IDCT32.
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
      stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
      stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
      stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
  int i, j, i32;

  for (i = 0; i < 4; i++) {
    i32 = (i << 5);  // base index of this strip's 32 vectors within col[]
    // First 1-D idct
    // Load input data. (Interleaved destination indices match the layout
    // expected by the IDCT32 butterfly; the load order follows the
    // sequential advance of `input` by LOAD_DQCOEFF.)
    LOAD_DQCOEFF(in[0], input);
    LOAD_DQCOEFF(in[8], input);
    LOAD_DQCOEFF(in[16], input);
    LOAD_DQCOEFF(in[24], input);
    LOAD_DQCOEFF(in[1], input);
    LOAD_DQCOEFF(in[9], input);
    LOAD_DQCOEFF(in[17], input);
    LOAD_DQCOEFF(in[25], input);
    LOAD_DQCOEFF(in[2], input);
    LOAD_DQCOEFF(in[10], input);
    LOAD_DQCOEFF(in[18], input);
    LOAD_DQCOEFF(in[26], input);
    LOAD_DQCOEFF(in[3], input);
    LOAD_DQCOEFF(in[11], input);
    LOAD_DQCOEFF(in[19], input);
    LOAD_DQCOEFF(in[27], input);

    LOAD_DQCOEFF(in[4], input);
    LOAD_DQCOEFF(in[12], input);
    LOAD_DQCOEFF(in[20], input);
    LOAD_DQCOEFF(in[28], input);
    LOAD_DQCOEFF(in[5], input);
    LOAD_DQCOEFF(in[13], input);
    LOAD_DQCOEFF(in[21], input);
    LOAD_DQCOEFF(in[29], input);
    LOAD_DQCOEFF(in[6], input);
    LOAD_DQCOEFF(in[14], input);
    LOAD_DQCOEFF(in[22], input);
    LOAD_DQCOEFF(in[30], input);
    LOAD_DQCOEFF(in[7], input);
    LOAD_DQCOEFF(in[15], input);
    LOAD_DQCOEFF(in[23], input);
    LOAD_DQCOEFF(in[31], input);

    // checking if all entries are zero:
    // OR-reduce all 32 vectors down to a single vector (zero_idx[14]).
    zero_idx[0] = _mm_or_si128(in[0], in[1]);
    zero_idx[1] = _mm_or_si128(in[2], in[3]);
    zero_idx[2] = _mm_or_si128(in[4], in[5]);
    zero_idx[3] = _mm_or_si128(in[6], in[7]);
    zero_idx[4] = _mm_or_si128(in[8], in[9]);
    zero_idx[5] = _mm_or_si128(in[10], in[11]);
    zero_idx[6] = _mm_or_si128(in[12], in[13]);
    zero_idx[7] = _mm_or_si128(in[14], in[15]);
    zero_idx[8] = _mm_or_si128(in[16], in[17]);
    zero_idx[9] = _mm_or_si128(in[18], in[19]);
    zero_idx[10] = _mm_or_si128(in[20], in[21]);
    zero_idx[11] = _mm_or_si128(in[22], in[23]);
    zero_idx[12] = _mm_or_si128(in[24], in[25]);
    zero_idx[13] = _mm_or_si128(in[26], in[27]);
    zero_idx[14] = _mm_or_si128(in[28], in[29]);
    zero_idx[15] = _mm_or_si128(in[30], in[31]);

    zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
    zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);

    zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
    zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
    zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
    zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
    zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
    zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
    zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);

    // If every coefficient in this 32x8 strip is zero, the 1-D transform of
    // the strip is zero too: store zeros and skip the butterfly entirely.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
      col[i32 + 0] = _mm_setzero_si128();
      col[i32 + 1] = _mm_setzero_si128();
      col[i32 + 2] = _mm_setzero_si128();
      col[i32 + 3] = _mm_setzero_si128();
      col[i32 + 4] = _mm_setzero_si128();
      col[i32 + 5] = _mm_setzero_si128();
      col[i32 + 6] = _mm_setzero_si128();
      col[i32 + 7] = _mm_setzero_si128();
      col[i32 + 8] = _mm_setzero_si128();
      col[i32 + 9] = _mm_setzero_si128();
      col[i32 + 10] = _mm_setzero_si128();
      col[i32 + 11] = _mm_setzero_si128();
      col[i32 + 12] = _mm_setzero_si128();
      col[i32 + 13] = _mm_setzero_si128();
      col[i32 + 14] = _mm_setzero_si128();
      col[i32 + 15] = _mm_setzero_si128();
      col[i32 + 16] = _mm_setzero_si128();
      col[i32 + 17] = _mm_setzero_si128();
      col[i32 + 18] = _mm_setzero_si128();
      col[i32 + 19] = _mm_setzero_si128();
      col[i32 + 20] = _mm_setzero_si128();
      col[i32 + 21] = _mm_setzero_si128();
      col[i32 + 22] = _mm_setzero_si128();
      col[i32 + 23] = _mm_setzero_si128();
      col[i32 + 24] = _mm_setzero_si128();
      col[i32 + 25] = _mm_setzero_si128();
      col[i32 + 26] = _mm_setzero_si128();
      col[i32 + 27] = _mm_setzero_si128();
      col[i32 + 28] = _mm_setzero_si128();
      col[i32 + 29] = _mm_setzero_si128();
      col[i32 + 30] = _mm_setzero_si128();
      col[i32 + 31] = _mm_setzero_si128();
      continue;
    }

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(in, in);
    array_transpose_8x8(in + 8, in + 8);
    array_transpose_8x8(in + 16, in + 16);
    array_transpose_8x8(in + 24, in + 24);

    IDCT32

    // 1_D: Store 32 intermediate results for each 8x32 block.
    // Final stage of the butterfly: outputs k and 31-k are the
    // sum/difference of stp1_k and stp1_(31-k).
    col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
    col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
    col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
    col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
    col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
    col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
    col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
    col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
    col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
    col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
    col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
    col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
    col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
    col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
    col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
    col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
    col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
    col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
    col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
    col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
    col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
    col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
    col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
    col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
    col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
    col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
    col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
    col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
    col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
    col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
  }
  for (i = 0; i < 4; i++) {
    // Second 1-D idct
    j = i << 3;  // starting column of this 8-wide strip

    // Transpose 32x8 block to 8x32 block
    array_transpose_8x8(col + j, in);
    array_transpose_8x8(col + j + 32, in + 8);
    array_transpose_8x8(col + j + 64, in + 16);
    array_transpose_8x8(col + j + 96, in + 24);

    IDCT32

    // 2_D: Calculate the results and store them to destination.
    in[0] = _mm_add_epi16(stp1_0, stp1_31);
    in[1] = _mm_add_epi16(stp1_1, stp1_30);
    in[2] = _mm_add_epi16(stp1_2, stp1_29);
    in[3] = _mm_add_epi16(stp1_3, stp1_28);
    in[4] = _mm_add_epi16(stp1_4, stp1_27);
    in[5] = _mm_add_epi16(stp1_5, stp1_26);
    in[6] = _mm_add_epi16(stp1_6, stp1_25);
    in[7] = _mm_add_epi16(stp1_7, stp1_24);
    in[8] = _mm_add_epi16(stp1_8, stp1_23);
    in[9] = _mm_add_epi16(stp1_9, stp1_22);
    in[10] = _mm_add_epi16(stp1_10, stp1_21);
    in[11] = _mm_add_epi16(stp1_11, stp1_20);
    in[12] = _mm_add_epi16(stp1_12, stp1_19);
    in[13] = _mm_add_epi16(stp1_13, stp1_18);
    in[14] = _mm_add_epi16(stp1_14, stp1_17);
    in[15] = _mm_add_epi16(stp1_15, stp1_16);
    in[16] = _mm_sub_epi16(stp1_15, stp1_16);
    in[17] = _mm_sub_epi16(stp1_14, stp1_17);
    in[18] = _mm_sub_epi16(stp1_13, stp1_18);
    in[19] = _mm_sub_epi16(stp1_12, stp1_19);
    in[20] = _mm_sub_epi16(stp1_11, stp1_20);
    in[21] = _mm_sub_epi16(stp1_10, stp1_21);
    in[22] = _mm_sub_epi16(stp1_9, stp1_22);
    in[23] = _mm_sub_epi16(stp1_8, stp1_23);
    in[24] = _mm_sub_epi16(stp1_7, stp1_24);
    in[25] = _mm_sub_epi16(stp1_6, stp1_25);
    in[26] = _mm_sub_epi16(stp1_5, stp1_26);
    in[27] = _mm_sub_epi16(stp1_4, stp1_27);
    in[28] = _mm_sub_epi16(stp1_3, stp1_28);
    in[29] = _mm_sub_epi16(stp1_2, stp1_29);
    in[30] = _mm_sub_epi16(stp1_1, stp1_30);
    in[31] = _mm_sub_epi16(stp1_0, stp1_31);

    for (j = 0; j < 32; ++j) {
      // Final rounding and shift (saturating add of 32, then >> 6)
      in[j] = _mm_adds_epi16(in[j], final_rounding);
      in[j] = _mm_srai_epi16(in[j], 6);
      RECON_AND_STORE(dest + j * stride, in[j]);
    }

    dest += 8;  // advance to the next 8-pixel-wide column strip
  }
}
2761
2762 void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
2763                               int stride) {
2764   __m128i dc_value;
2765   const __m128i zero = _mm_setzero_si128();
2766   int a, j;
2767
2768   a = (int)dct_const_round_shift(input[0] * cospi_16_64);
2769   a = (int)dct_const_round_shift(a * cospi_16_64);
2770   a = ROUND_POWER_OF_TWO(a, 6);
2771
2772   dc_value = _mm_set1_epi16(a);
2773
2774   for (j = 0; j < 32; ++j) {
2775     RECON_AND_STORE(dest + 0 + j * stride, dc_value);
2776     RECON_AND_STORE(dest + 8 + j * stride, dc_value);
2777     RECON_AND_STORE(dest + 16 + j * stride, dc_value);
2778     RECON_AND_STORE(dest + 24 + j * stride, dc_value);
2779   }
2780 }