-#include "vp10/common/x86/vp10_txfm1d_sse2.h"
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
-void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 4;
const int num_per_128 = 4;
const int32_t* cospi;
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+ buf0[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
// stage 3
stage_idx++;
}
}
-void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 8;
const int num_per_128 = 4;
const int32_t* cospi;
buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
buf0[4] = buf1[4];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
- bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
buf0[7] = buf1[7];
// stage 3
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
- bit);
- btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+ buf1[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+ buf1[3], bit);
buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- bit);
- btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
- bit);
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
// stage 5
stage_idx++;
}
}
-void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 16;
const int num_per_128 = 4;
const int32_t* cospi;
buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+ buf0[12], bit);
buf0[14] = buf1[14];
buf0[15] = buf1[15];
buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
buf1[4] = buf0[4];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6],
- bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+ buf1[6], bit);
buf1[7] = buf0[7];
buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+ buf0[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
buf0[8] = buf1[8];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+ buf0[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
buf0[11] = buf1[11];
buf0[12] = buf1[12];
buf0[15] = buf1[15];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
buf1[3] = buf0[3];
- btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
- bit);
- btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6],
- bit);
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+ buf1[6], bit);
buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
buf0[5] = buf1[5];
buf0[6] = buf1[6];
buf0[7] = buf1[7];
- btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15],
- bit);
- btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+ buf0[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+ buf0[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+ buf0[12], bit);
// stage 7
stage_idx++;
}
}
-void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 32;
const int num_per_128 = 4;
const int32_t* cospi;
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[19] = buf1[19];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
- buf0[24], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
buf0[28] = buf1[28];
buf0[29] = buf1[29];
buf0[30] = buf1[30];
buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
buf1[8] = buf0[8];
buf1[9] = buf0[9];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
- buf1[12], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
buf1[14] = buf0[14];
buf1[15] = buf0[15];
buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
buf0[4] = buf1[4];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
- bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
buf0[7] = buf1[7];
buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
- buf0[28], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
buf0[22] = buf1[22];
buf0[23] = buf1[23];
buf0[24] = buf1[24];
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
- bit);
- btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+ buf1[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+ buf1[3], bit);
buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
buf1[8] = buf0[8];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
- buf1[14], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
buf1[11] = buf0[11];
buf1[12] = buf0[12];
buf1[15] = buf0[15];
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
- bit);
- btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
- bit);
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
buf0[16] = buf1[16];
- btf_32_sse2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
- buf0[30], bit);
- btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
buf0[19] = buf1[19];
buf0[20] = buf1[20];
- btf_32_sse2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
buf0[23] = buf1[23];
buf0[24] = buf1[24];
buf0[27] = buf1[27];
buf1[5] = buf0[5];
buf1[6] = buf0[6];
buf1[7] = buf0[7];
- btf_32_sse2_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
- bit);
- btf_32_sse2_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
- buf1[14], bit);
- btf_32_sse2_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
- buf1[13], bit);
- btf_32_sse2_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
- buf1[12], bit);
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+ buf1[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
buf0[13] = buf1[13];
buf0[14] = buf1[14];
buf0[15] = buf1[15];
- btf_32_sse2_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
- buf0[31], bit);
- btf_32_sse2_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
- buf0[30], bit);
- btf_32_sse2_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
- buf0[29], bit);
- btf_32_sse2_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
- buf0[28], bit);
- btf_32_sse2_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
- buf0[27], bit);
- btf_32_sse2_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
- buf0[26], bit);
- btf_32_sse2_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
- buf0[25], bit);
- btf_32_sse2_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
- buf0[24], bit);
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
// stage 9
stage_idx++;
}
}
-void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 4;
const int num_per_128 = 4;
const int32_t* cospi;
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
// stage 3
stage_idx++;
cospi = cospi_arr[bit - cos_bit_min];
buf0[0] = buf1[0];
buf0[1] = buf1[1];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
// stage 5
stage_idx++;
}
}
-void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 8;
const int num_per_128 = 4;
const int32_t* cospi;
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
- btf_32_sse2_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+ btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
// stage 3
stage_idx++;
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
// stage 5
stage_idx++;
cospi = cospi_arr[bit - cos_bit_min];
buf0[0] = buf1[0];
buf0[1] = buf1[1];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
buf0[4] = buf1[4];
buf0[5] = buf1[5];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
// stage 7
stage_idx++;
}
}
-void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 16;
const int num_per_128 = 4;
const int32_t* cospi;
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
- btf_32_sse2_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
- btf_32_sse2_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse2_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse2_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+ btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
+ buf0[9], bit);
+ btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
// stage 3
stage_idx++;
buf0[5] = buf1[5];
buf0[6] = buf1[6];
buf0[7] = buf1[7];
- btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
// stage 5
stage_idx++;
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
buf0[10] = buf1[10];
buf0[11] = buf1[11];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
// stage 7
stage_idx++;
cospi = cospi_arr[bit - cos_bit_min];
buf0[0] = buf1[0];
buf0[1] = buf1[1];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
buf0[4] = buf1[4];
buf0[5] = buf1[5];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
buf0[12] = buf1[12];
buf0[13] = buf1[13];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
// stage 9
stage_idx++;
}
}
-void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 32;
const int num_per_128 = 4;
const int32_t* cospi;
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
- btf_32_sse2_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
- btf_32_sse2_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse2_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse2_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
- btf_32_sse2_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
- buf0[17], bit);
- btf_32_sse2_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
- btf_32_sse2_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse2_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- btf_32_sse2_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse2_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse2_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse2_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
+ btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+ bit);
+ btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+ bit);
+ btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+ buf0[9], bit);
+ btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
// stage 3
stage_idx++;
buf0[13] = buf1[13];
buf0[14] = buf1[14];
buf0[15] = buf1[15];
- btf_32_sse2_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
- buf0[17], bit);
- btf_32_sse2_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
- btf_32_sse2_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse2_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
- btf_32_sse2_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse2_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse2_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse2_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
+ btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
// stage 5
stage_idx++;
buf0[5] = buf1[5];
buf0[6] = buf1[6];
buf0[7] = buf1[7];
- btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
- bit);
- btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
- btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[21] = buf1[21];
buf0[22] = buf1[22];
buf0[23] = buf1[23];
- btf_32_sse2_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
- buf0[25], bit);
- btf_32_sse2_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
- btf_32_sse2_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse2_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
// stage 7
stage_idx++;
buf0[1] = buf1[1];
buf0[2] = buf1[2];
buf0[3] = buf1[3];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
- bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
buf0[10] = buf1[10];
buf0[11] = buf1[11];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
buf0[18] = buf1[18];
buf0[19] = buf1[19];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
- buf0[21], bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
buf0[24] = buf1[24];
buf0[25] = buf1[25];
buf0[26] = buf1[26];
buf0[27] = buf1[27];
- btf_32_sse2_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
- buf0[29], bit);
- btf_32_sse2_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
// stage 9
stage_idx++;
cospi = cospi_arr[bit - cos_bit_min];
buf0[0] = buf1[0];
buf0[1] = buf1[1];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
buf0[4] = buf1[4];
buf0[5] = buf1[5];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
- buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
buf0[12] = buf1[12];
buf0[13] = buf1[13];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
- buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
buf0[16] = buf1[16];
buf0[17] = buf1[17];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
- buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
buf0[20] = buf1[20];
buf0[21] = buf1[21];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
- buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
buf0[24] = buf1[24];
buf0[25] = buf1[25];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
- buf0[27], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
buf0[28] = buf1[28];
buf0[29] = buf1[29];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
- buf0[31], bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
// stage 11
stage_idx++;
}
}
-void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
- const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+ const int8_t* cos_bit, const int8_t* stage_range) {
const int txfm_size = 64;
const int num_per_128 = 4;
const int32_t* cospi;
buf0[37] = buf1[37];
buf0[38] = buf1[38];
buf0[39] = buf1[39];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
- buf0[55], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
- buf0[54], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
- buf0[53], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
- buf0[52], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
- buf0[51], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
- buf0[50], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
- buf0[49], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
- buf0[48], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
+ buf0[55], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
+ buf0[54], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
+ buf0[53], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
+ buf0[52], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
+ buf0[51], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
+ buf0[50], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
+ buf0[49], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
+ buf0[48], bit);
buf0[56] = buf1[56];
buf0[57] = buf1[57];
buf0[58] = buf1[58];
buf1[17] = buf0[17];
buf1[18] = buf0[18];
buf1[19] = buf0[19];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
- buf1[27], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
- buf1[26], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
- buf1[25], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
- buf1[24], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
+ buf1[27], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
+ buf1[26], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
+ buf1[25], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
+ buf1[24], bit);
buf1[28] = buf0[28];
buf1[29] = buf0[29];
buf1[30] = buf0[30];
buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
buf0[8] = buf1[8];
buf0[9] = buf1[9];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+ buf0[12], bit);
buf0[14] = buf1[14];
buf0[15] = buf1[15];
buf0[16] = _mm_add_epi32(buf1[16], buf1[23]);
buf0[33] = buf1[33];
buf0[34] = buf1[34];
buf0[35] = buf1[35];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
- buf0[59], bit);
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
- buf0[58], bit);
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
- buf0[57], bit);
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
- buf0[56], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
- buf0[55], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
- buf0[54], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
- buf0[53], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
- buf0[52], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
+ buf0[59], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
+ buf0[58], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
+ buf0[57], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
+ buf0[56], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
+ buf0[55], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
+ buf0[54], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
+ buf0[53], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
+ buf0[52], bit);
buf0[44] = buf1[44];
buf0[45] = buf1[45];
buf0[46] = buf1[46];
buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
buf1[4] = buf0[4];
- btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6],
- bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+ buf1[6], bit);
buf1[7] = buf0[7];
buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
buf1[16] = buf0[16];
buf1[17] = buf0[17];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
- buf1[29], bit);
- btf_32_sse2_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
- buf1[28], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
- buf1[27], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
- buf1[26], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
+ buf1[29], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
+ buf1[28], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
+ buf1[27], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
+ buf1[26], bit);
buf1[22] = buf0[22];
buf1[23] = buf0[23];
buf1[24] = buf0[24];
stage_idx++;
bit = cos_bit[stage_idx];
cospi = cospi_arr[bit - cos_bit_min];
- btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
- bit);
- btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
- bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+ buf0[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
buf0[8] = buf1[8];
- btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+ buf0[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
buf0[11] = buf1[11];
buf0[12] = buf1[12];
buf0[15] = buf1[15];
buf0[30] = _mm_add_epi32(buf1[30], buf1[29]);
buf0[32] = buf1[32];
buf0[33] = buf1[33];
- btf_32_sse2_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
- buf0[61], bit);
- btf_32_sse2_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
- buf0[60], bit);
- btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
- buf0[59], bit);
- btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
- buf0[58], bit);
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
+ buf0[61], bit);
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
+ buf0[60], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
+ buf0[59], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
+ buf0[58], bit);
buf0[38] = buf1[38];
buf0[39] = buf1[39];
buf0[40] = buf1[40];
buf0[41] = buf1[41];
- btf_32_sse2_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
- buf0[53], bit);
- btf_32_sse2_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
- buf0[52], bit);
- btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
- buf0[51], bit);
- btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
- buf0[50], bit);
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
+ buf0[53], bit);
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
+ buf0[52], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
+ buf0[51], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
+ buf0[50], bit);
buf0[46] = buf1[46];
buf0[47] = buf1[47];
buf0[48] = buf1[48];
buf1[1] = buf0[1];
buf1[2] = buf0[2];
buf1[3] = buf0[3];
- btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
- bit);
- btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6],
- bit);
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+ buf1[6], bit);
buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
buf1[16] = buf0[16];
- btf_32_sse2_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
- buf1[30], bit);
- btf_32_sse2_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
- buf1[29], bit);
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
+ buf1[30], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
+ buf1[29], bit);
buf1[19] = buf0[19];
buf1[20] = buf0[20];
- btf_32_sse2_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
- buf1[26], bit);
- btf_32_sse2_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
- buf1[25], bit);
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
+ buf1[26], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
+ buf1[25], bit);
buf1[23] = buf0[23];
buf1[24] = buf0[24];
buf1[27] = buf0[27];
buf0[5] = buf1[5];
buf0[6] = buf1[6];
buf0[7] = buf1[7];
- btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15],
- bit);
- btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
- buf0[14], bit);
- btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
- buf0[13], bit);
- btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
- buf0[12], bit);
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+ buf0[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+ buf0[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+ buf0[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+ buf0[12], bit);
buf0[16] = _mm_add_epi32(buf1[16], buf1[17]);
buf0[17] = _mm_sub_epi32(buf1[16], buf1[17]);
buf0[18] = _mm_sub_epi32(buf1[19], buf1[18]);
buf0[30] = _mm_sub_epi32(buf1[31], buf1[30]);
buf0[31] = _mm_add_epi32(buf1[31], buf1[30]);
buf0[32] = buf1[32];
- btf_32_sse2_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
- buf0[62], bit);
- btf_32_sse2_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
- buf0[61], bit);
+ btf_32_sse4_1_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
+ buf0[62], bit);
+ btf_32_sse4_1_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
+ buf0[61], bit);
buf0[35] = buf1[35];
buf0[36] = buf1[36];
- btf_32_sse2_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
- buf0[58], bit);
- btf_32_sse2_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
- buf0[57], bit);
+ btf_32_sse4_1_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
+ buf0[58], bit);
+ btf_32_sse4_1_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
+ buf0[57], bit);
buf0[39] = buf1[39];
buf0[40] = buf1[40];
- btf_32_sse2_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
- buf0[54], bit);
- btf_32_sse2_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
- buf0[53], bit);
+ btf_32_sse4_1_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
+ buf0[54], bit);
+ btf_32_sse4_1_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
+ buf0[53], bit);
buf0[43] = buf1[43];
buf0[44] = buf1[44];
- btf_32_sse2_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
- buf0[50], bit);
- btf_32_sse2_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
- buf0[49], bit);
+ btf_32_sse4_1_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
+ buf0[50], bit);
+ btf_32_sse4_1_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
+ buf0[49], bit);
buf0[47] = buf1[47];
buf0[48] = buf1[48];
buf0[51] = buf1[51];
buf1[13] = buf0[13];
buf1[14] = buf0[14];
buf1[15] = buf0[15];
- btf_32_sse2_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
- buf1[31], bit);
- btf_32_sse2_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
- buf1[30], bit);
- btf_32_sse2_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
- buf1[29], bit);
- btf_32_sse2_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
- buf1[28], bit);
- btf_32_sse2_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
- buf1[27], bit);
- btf_32_sse2_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
- buf1[26], bit);
- btf_32_sse2_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
- buf1[25], bit);
- btf_32_sse2_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
- buf1[24], bit);
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
+ buf1[31], bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
+ buf1[30], bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
+ buf1[29], bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
+ buf1[28], bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
+ buf1[27], bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
+ buf1[26], bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
+ buf1[25], bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
+ buf1[24], bit);
buf1[32] = _mm_add_epi32(buf0[32], buf0[33]);
buf1[33] = _mm_sub_epi32(buf0[32], buf0[33]);
buf1[34] = _mm_sub_epi32(buf0[35], buf0[34]);
buf0[29] = buf1[29];
buf0[30] = buf1[30];
buf0[31] = buf1[31];
- btf_32_sse2_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
- buf0[63], bit);
- btf_32_sse2_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
- buf0[62], bit);
- btf_32_sse2_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
- buf0[61], bit);
- btf_32_sse2_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
- buf0[60], bit);
- btf_32_sse2_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
- buf0[59], bit);
- btf_32_sse2_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
- buf0[58], bit);
- btf_32_sse2_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
- buf0[57], bit);
- btf_32_sse2_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
- buf0[56], bit);
- btf_32_sse2_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
- buf0[55], bit);
- btf_32_sse2_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
- buf0[54], bit);
- btf_32_sse2_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
- buf0[53], bit);
- btf_32_sse2_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
- buf0[52], bit);
- btf_32_sse2_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
- buf0[51], bit);
- btf_32_sse2_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
- buf0[50], bit);
- btf_32_sse2_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
- buf0[49], bit);
- btf_32_sse2_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
- buf0[48], bit);
+ btf_32_sse4_1_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
+ buf0[63], bit);
+ btf_32_sse4_1_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
+ buf0[62], bit);
+ btf_32_sse4_1_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
+ buf0[61], bit);
+ btf_32_sse4_1_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
+ buf0[60], bit);
+ btf_32_sse4_1_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
+ buf0[59], bit);
+ btf_32_sse4_1_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
+ buf0[58], bit);
+ btf_32_sse4_1_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
+ buf0[57], bit);
+ btf_32_sse4_1_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
+ buf0[56], bit);
+ btf_32_sse4_1_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
+ buf0[55], bit);
+ btf_32_sse4_1_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
+ buf0[54], bit);
+ btf_32_sse4_1_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
+ buf0[53], bit);
+ btf_32_sse4_1_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
+ buf0[52], bit);
+ btf_32_sse4_1_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
+ buf0[51], bit);
+ btf_32_sse4_1_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
+ buf0[50], bit);
+ btf_32_sse4_1_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
+ buf0[49], bit);
+ btf_32_sse4_1_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
+ buf0[48], bit);
// stage 11
stage_idx++;