]> granicus.if.org Git - libvpx/commitdiff
change vp10_fwd_txfm2d_#x#_sse2 to vp10_fwd_txfm2d_#x#_sse4_1
author Angie Chiang <angiebird@google.com>
Thu, 24 Mar 2016 22:34:27 +0000 (15:34 -0700)
committer Angie Chiang <angiebird@google.com>
Wed, 30 Mar 2016 22:25:26 +0000 (15:25 -0700)
The speed performance for running 20k times is as follows:

Note that the vp10_highbd_fdct#x#_sse2 version is
a 16-bit version plus a range check.

The rest are 32-bit versions.

vp10_fwd_txfm2d_4x4_c (2 ms)
vp10_fwd_txfm2d_8x8_c (9 ms)
vp10_fwd_txfm2d_16x16_c (45 ms)
vp10_fwd_txfm2d_32x32_c (233 ms)

vp10_fwd_txfm2d_4x4_sse4_1 (2 ms)
vp10_fwd_txfm2d_8x8_sse4_1 (3 ms)
vp10_fwd_txfm2d_16x16_sse4_1 (16 ms)
vp10_fwd_txfm2d_32x32_sse4_1 (80 ms)

vp10_highbd_fdct4x4_c (1 ms)
vp10_highbd_fdct8x8_c (3 ms)
vp10_highbd_fdct16x16_c (17 ms)
highbd_fdct32x32_c (160 ms)

vp10_highbd_fdct4x4_sse2 (0 ms)
vp10_highbd_fdct8x8_sse2 (2 ms)
vp10_highbd_fdct16x16_sse2 (8 ms)
highbd_fdct32x32_sse2 (105 ms)

Change-Id: I24daf1e0d4d66e91e4ce61ef71cefa7b70ee90ce

test/test.mk
test/vp10_fwd_txfm2d_sse4_test.cc [moved from test/vp10_fwd_txfm2d_sse2_test.cc with 76% similarity]
vp10/common/vp10_rtcd_defs.pl
vp10/common/x86/vp10_fwd_txfm1d_sse4.c [moved from vp10/common/x86/vp10_fwd_txfm1d_sse2.c with 74% similarity]
vp10/common/x86/vp10_fwd_txfm2d_sse4.c [moved from vp10/common/x86/vp10_fwd_txfm2d_sse2.c with 68% similarity]
vp10/common/x86/vp10_txfm1d_sse2.h [deleted file]
vp10/common/x86/vp10_txfm1d_sse4.h [new file with mode: 0644]
vp10/vp10_common.mk

index d466b4712f97d55c626103f6d6c51088eafd7c74..7c3f10142c07016d76e95185092da1ab380b9dc6 100644 (file)
@@ -174,7 +174,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ANS)          += vp10_ans_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
 
-LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp10_fwd_txfm2d_sse2_test.cc
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_fwd_txfm2d_sse4_test.cc
 
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
similarity index 76%
rename from test/vp10_fwd_txfm2d_sse2_test.cc
rename to test/vp10_fwd_txfm2d_sse4_test.cc
index f5cc15944d7698c073296c6dfabeb7810c9e09b3..d3882cd9e713130c6c18d501822fdd2aa5acca0b 100644 (file)
@@ -12,9 +12,9 @@ using libvpx_test::ACMRandom;
 namespace {
 
 #if CONFIG_VP9_HIGHBITDEPTH
-TEST(vp10_fwd_txfm2d_sse2, accuracy) {
+TEST(vp10_fwd_txfm2d_sse4_1, accuracy) {
   int16_t input[4096] = {0};
-  int32_t output_sse2[4096] = {0};
+  int32_t output_sse4_1[4096] = {0};
   int32_t output_c[4096] = {0};
 
   int txfm_num = 17;
@@ -36,10 +36,10 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) {
       vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c,
   };
 
-  Fwd_Txfm2d_Func txfm2d_func_sse2_list[] = {
-      vp10_fwd_txfm2d_4x4_sse2,   vp10_fwd_txfm2d_8x8_sse2,
-      vp10_fwd_txfm2d_16x16_sse2, vp10_fwd_txfm2d_32x32_sse2,
-      vp10_fwd_txfm2d_64x64_sse2,
+  Fwd_Txfm2d_Func txfm2d_func_sse4_1_list[] = {
+      vp10_fwd_txfm2d_4x4_sse4_1,   vp10_fwd_txfm2d_8x8_sse4_1,
+      vp10_fwd_txfm2d_16x16_sse4_1, vp10_fwd_txfm2d_32x32_sse4_1,
+      vp10_fwd_txfm2d_64x64_sse4_1,
   };
 
   for (int i = 0; i < txfm_num; i++) {
@@ -47,7 +47,7 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) {
     int txfm_size = cfg.txfm_size;
     int func_idx = get_max_bit(txfm_size) - 2;
     Fwd_Txfm2d_Func txfm2d_func_c = txfm2d_func_c_list[func_idx];
-    Fwd_Txfm2d_Func txfm2d_func_sse2 = txfm2d_func_sse2_list[func_idx];
+    Fwd_Txfm2d_Func txfm2d_func_sse4_1 = txfm2d_func_sse4_1_list[func_idx];
 
     ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -59,10 +59,11 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) {
     }
 
     txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, 10);
-    txfm2d_func_sse2(input, output_sse2, cfg.txfm_size, &cfg, 10);
+    txfm2d_func_sse4_1(input, output_sse4_1, cfg.txfm_size, &cfg, 10);
     for (int r = 0; r < txfm_size; r++) {
       for (int c = 0; c < txfm_size; c++) {
-        EXPECT_EQ(output_c[r * txfm_size + c], output_sse2[r * txfm_size + c]);
+        EXPECT_EQ(output_c[r * txfm_size + c],
+                  output_sse4_1[r * txfm_size + c]);
       }
     }
   }
index ec619c3ab3cc4ab73dddec39dc6275efd32b6a3d..7b20239389a396f3928cf5436138fd4defd3cb30 100644 (file)
@@ -615,15 +615,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #fwd txfm
   add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_4x4 sse2/;
+  specialize qw/vp10_fwd_txfm2d_4x4 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_8x8 sse2/;
+  specialize qw/vp10_fwd_txfm2d_8x8 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_16x16 sse2/;
+  specialize qw/vp10_fwd_txfm2d_16x16 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_32x32 sse2/;
+  specialize qw/vp10_fwd_txfm2d_32x32 sse4_1/;
   add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
-  specialize qw/vp10_fwd_txfm2d_64x64 sse2/;
+  specialize qw/vp10_fwd_txfm2d_64x64 sse4_1/;
 
   #inv txfm
   add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd";
similarity index 74%
rename from vp10/common/x86/vp10_fwd_txfm1d_sse2.c
rename to vp10/common/x86/vp10_fwd_txfm1d_sse4.c
index fd9e7a3bbf007cb83ef2e6979fbece48d4502132..5ade8bd3f102c14debb8244d6ece83201df5b42d 100644 (file)
@@ -1,7 +1,7 @@
-#include "vp10/common/x86/vp10_txfm1d_sse2.h"
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
 
-void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 4;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -32,10 +32,10 @@ void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
 
     // stage 3
     stage_idx++;
@@ -53,8 +53,8 @@ void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 8;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -98,18 +98,18 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
     buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
     buf0[4] = buf1[4];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
-                      bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
     buf0[7] = buf1[7];
 
     // stage 3
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
-                      bit);
-    btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
     buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
     buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
     buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
@@ -123,10 +123,10 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = buf1[1];
     buf0[2] = buf1[2];
     buf0[3] = buf1[3];
-    btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                      bit);
-    btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
-                      bit);
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
 
     // stage 5
     stage_idx++;
@@ -152,8 +152,8 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 16;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -218,10 +218,10 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
     buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
-                      buf0[12], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
     buf0[14] = buf1[14];
     buf0[15] = buf1[15];
 
@@ -234,8 +234,8 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
     buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
     buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
     buf1[4] = buf0[4];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6],
-                      bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
     buf1[7] = buf0[7];
     buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
     buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
@@ -250,19 +250,19 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
     buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
     buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
     buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
     buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
     buf0[8] = buf1[8];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
-                      buf0[14], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
     buf0[11] = buf1[11];
     buf0[12] = buf1[12];
     buf0[15] = buf1[15];
@@ -275,10 +275,10 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
     buf1[1] = buf0[1];
     buf1[2] = buf0[2];
     buf1[3] = buf0[3];
-    btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
-                      bit);
-    btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6],
-                      bit);
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
     buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
     buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
     buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
@@ -300,14 +300,14 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
     buf0[5] = buf1[5];
     buf0[6] = buf1[6];
     buf0[7] = buf1[7];
-    btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15],
-                      bit);
-    btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
-                      buf0[14], bit);
-    btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
-    btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
-                      buf0[12], bit);
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
 
     // stage 7
     stage_idx++;
@@ -349,8 +349,8 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 32;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -457,14 +457,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[17] = buf1[17];
     buf0[18] = buf1[18];
     buf0[19] = buf1[19];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
-                      buf0[27], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
-                      buf0[26], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
-                      buf0[25], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
-                      buf0[24], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
     buf0[28] = buf1[28];
     buf0[29] = buf1[29];
     buf0[30] = buf1[30];
@@ -484,10 +484,10 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
     buf1[8] = buf0[8];
     buf1[9] = buf0[9];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
-                      buf1[13], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
-                      buf1[12], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
     buf1[14] = buf0[14];
     buf1[15] = buf0[15];
     buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
@@ -516,8 +516,8 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
     buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
     buf0[4] = buf1[4];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
-                      bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
     buf0[7] = buf1[7];
     buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
     buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
@@ -529,14 +529,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
     buf0[16] = buf1[16];
     buf0[17] = buf1[17];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
-                      buf0[29], bit);
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
-                      buf0[28], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
-                      buf0[27], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
-                      buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
     buf0[22] = buf1[22];
     buf0[23] = buf1[23];
     buf0[24] = buf1[24];
@@ -548,19 +548,19 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
-                      bit);
-    btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
     buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
     buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
     buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
     buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
     buf1[8] = buf0[8];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
-                      buf1[14], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
-                      buf1[13], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
     buf1[11] = buf0[11];
     buf1[12] = buf0[12];
     buf1[15] = buf0[15];
@@ -589,10 +589,10 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = buf1[1];
     buf0[2] = buf1[2];
     buf0[3] = buf1[3];
-    btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
-                      bit);
-    btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
-                      bit);
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
     buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
     buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
     buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
@@ -602,16 +602,16 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
     buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
     buf0[16] = buf1[16];
-    btf_32_sse2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
-                      buf0[30], bit);
-    btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
-                      buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
     buf0[19] = buf1[19];
     buf0[20] = buf1[20];
-    btf_32_sse2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
-                      buf0[26], bit);
-    btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
-                      buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
     buf0[23] = buf1[23];
     buf0[24] = buf1[24];
     buf0[27] = buf1[27];
@@ -630,14 +630,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf1[5] = buf0[5];
     buf1[6] = buf0[6];
     buf1[7] = buf0[7];
-    btf_32_sse2_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
-                      bit);
-    btf_32_sse2_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
-                      buf1[14], bit);
-    btf_32_sse2_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
-                      buf1[13], bit);
-    btf_32_sse2_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
-                      buf1[12], bit);
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+                        buf1[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
     buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
     buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
     buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
@@ -675,22 +675,22 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
     buf0[13] = buf1[13];
     buf0[14] = buf1[14];
     buf0[15] = buf1[15];
-    btf_32_sse2_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
-                      buf0[31], bit);
-    btf_32_sse2_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
-                      buf0[30], bit);
-    btf_32_sse2_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
-                      buf0[29], bit);
-    btf_32_sse2_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
-                      buf0[28], bit);
-    btf_32_sse2_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
-                      buf0[27], bit);
-    btf_32_sse2_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
-                      buf0[26], bit);
-    btf_32_sse2_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
-                      buf0[25], bit);
-    btf_32_sse2_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
-                      buf0[24], bit);
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+                        buf0[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
 
     // stage 9
     stage_idx++;
@@ -764,8 +764,8 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 4;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -796,10 +796,10 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
 
     // stage 3
     stage_idx++;
@@ -816,8 +816,8 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
     cospi = cospi_arr[bit - cos_bit_min];
     buf0[0] = buf1[0];
     buf0[1] = buf1[1];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
 
     // stage 5
     stage_idx++;
@@ -835,8 +835,8 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 8;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -875,14 +875,14 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
-    btf_32_sse2_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
 
     // stage 3
     stage_idx++;
@@ -905,10 +905,10 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = buf1[1];
     buf0[2] = buf1[2];
     buf0[3] = buf1[3];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
 
     // stage 5
     stage_idx++;
@@ -929,12 +929,12 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
     cospi = cospi_arr[bit - cos_bit_min];
     buf0[0] = buf1[0];
     buf0[1] = buf1[1];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
     buf0[4] = buf1[4];
     buf0[5] = buf1[5];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
 
     // stage 7
     stage_idx++;
@@ -960,8 +960,8 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 16;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -1016,22 +1016,22 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
-    btf_32_sse2_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
-    btf_32_sse2_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8], buf0[9],
-                      bit);
-    btf_32_sse2_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
-    btf_32_sse2_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
 
     // stage 3
     stage_idx++;
@@ -1066,14 +1066,14 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
     buf0[5] = buf1[5];
     buf0[6] = buf1[6];
     buf0[7] = buf1[7];
-    btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
-                      bit);
-    btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
-    btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
 
     // stage 5
     stage_idx++;
@@ -1104,18 +1104,18 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = buf1[1];
     buf0[2] = buf1[2];
     buf0[3] = buf1[3];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
     buf0[10] = buf1[10];
     buf0[11] = buf1[11];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
 
     // stage 7
     stage_idx++;
@@ -1144,20 +1144,20 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
     cospi = cospi_arr[bit - cos_bit_min];
     buf0[0] = buf1[0];
     buf0[1] = buf1[1];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
     buf0[4] = buf1[4];
     buf0[5] = buf1[5];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
     buf0[12] = buf1[12];
     buf0[13] = buf1[13];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
 
     // stage 9
     stage_idx++;
@@ -1199,8 +1199,8 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 32;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -1287,38 +1287,38 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
-    btf_32_sse2_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
-    btf_32_sse2_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], buf0[9],
-                      bit);
-    btf_32_sse2_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
-    btf_32_sse2_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
-    btf_32_sse2_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
-                      buf0[17], bit);
-    btf_32_sse2_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
-                      buf0[19], bit);
-    btf_32_sse2_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
-                      buf0[21], bit);
-    btf_32_sse2_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
-                      buf0[23], bit);
-    btf_32_sse2_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
-                      buf0[25], bit);
-    btf_32_sse2_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
-                      buf0[27], bit);
-    btf_32_sse2_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
-                      buf0[29], bit);
-    btf_32_sse2_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
-                      buf0[31], bit);
+    btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+                        bit);
+    btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+                        bit);
+    btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
 
     // stage 3
     stage_idx++;
@@ -1377,22 +1377,22 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     buf0[13] = buf1[13];
     buf0[14] = buf1[14];
     buf0[15] = buf1[15];
-    btf_32_sse2_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
-                      buf0[17], bit);
-    btf_32_sse2_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
-                      buf0[19], bit);
-    btf_32_sse2_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
-                      buf0[21], bit);
-    btf_32_sse2_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
-                      buf0[23], bit);
-    btf_32_sse2_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
-                      buf0[25], bit);
-    btf_32_sse2_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
-                      buf0[27], bit);
-    btf_32_sse2_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
-                      buf0[29], bit);
-    btf_32_sse2_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
-                      buf0[31], bit);
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
 
     // stage 5
     stage_idx++;
@@ -1443,14 +1443,14 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     buf0[5] = buf1[5];
     buf0[6] = buf1[6];
     buf0[7] = buf1[7];
-    btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
-                      bit);
-    btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
-    btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
     buf0[16] = buf1[16];
     buf0[17] = buf1[17];
     buf0[18] = buf1[18];
@@ -1459,14 +1459,14 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     buf0[21] = buf1[21];
     buf0[22] = buf1[22];
     buf0[23] = buf1[23];
-    btf_32_sse2_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
-                      buf0[25], bit);
-    btf_32_sse2_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
-                      buf0[27], bit);
-    btf_32_sse2_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
-                      buf0[29], bit);
-    btf_32_sse2_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
-                      buf0[31], bit);
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
 
     // stage 7
     stage_idx++;
@@ -1513,34 +1513,34 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     buf0[1] = buf1[1];
     buf0[2] = buf1[2];
     buf0[3] = buf1[3];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5],
-                      bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
     buf0[10] = buf1[10];
     buf0[11] = buf1[11];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
     buf0[16] = buf1[16];
     buf0[17] = buf1[17];
     buf0[18] = buf1[18];
     buf0[19] = buf1[19];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
-                      buf0[21], bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
-                      buf0[23], bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
     buf0[24] = buf1[24];
     buf0[25] = buf1[25];
     buf0[26] = buf1[26];
     buf0[27] = buf1[27];
-    btf_32_sse2_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
-                      buf0[29], bit);
-    btf_32_sse2_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
-                      buf0[31], bit);
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
 
     // stage 9
     stage_idx++;
@@ -1585,36 +1585,36 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
     cospi = cospi_arr[bit - cos_bit_min];
     buf0[0] = buf1[0];
     buf0[1] = buf1[1];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
     buf0[4] = buf1[4];
     buf0[5] = buf1[5];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
-                      buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
     buf0[12] = buf1[12];
     buf0[13] = buf1[13];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
-                      buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
     buf0[16] = buf1[16];
     buf0[17] = buf1[17];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
-                      buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
     buf0[20] = buf1[20];
     buf0[21] = buf1[21];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
-                      buf0[23], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
     buf0[24] = buf1[24];
     buf0[25] = buf1[25];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
-                      buf0[27], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
     buf0[28] = buf1[28];
     buf0[29] = buf1[29];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
-                      buf0[31], bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
 
     // stage 11
     stage_idx++;
@@ -1688,8 +1688,8 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
   }
 }
 
-void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range) {
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
   const int txfm_size = 64;
   const int num_per_128 = 4;
   const int32_t* cospi;
@@ -1880,22 +1880,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[37] = buf1[37];
     buf0[38] = buf1[38];
     buf0[39] = buf1[39];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
-                      buf0[55], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
-                      buf0[54], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
-                      buf0[53], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
-                      buf0[52], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
-                      buf0[51], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
-                      buf0[50], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
-                      buf0[49], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
-                      buf0[48], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
     buf0[56] = buf1[56];
     buf0[57] = buf1[57];
     buf0[58] = buf1[58];
@@ -1929,14 +1929,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[17] = buf0[17];
     buf1[18] = buf0[18];
     buf1[19] = buf0[19];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
-                      buf1[27], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
-                      buf1[26], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
-                      buf1[25], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
-                      buf1[24], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
     buf1[28] = buf0[28];
     buf1[29] = buf0[29];
     buf1[30] = buf0[30];
@@ -1988,10 +1988,10 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
     buf0[8] = buf1[8];
     buf0[9] = buf1[9];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
-                      buf0[12], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
     buf0[14] = buf1[14];
     buf0[15] = buf1[15];
     buf0[16] = _mm_add_epi32(buf1[16], buf1[23]);
@@ -2014,22 +2014,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[33] = buf1[33];
     buf0[34] = buf1[34];
     buf0[35] = buf1[35];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
-                      buf0[59], bit);
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
-                      buf0[58], bit);
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
-                      buf0[57], bit);
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
-                      buf0[56], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
-                      buf0[55], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
-                      buf0[54], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
-                      buf0[53], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
-                      buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
     buf0[44] = buf1[44];
     buf0[45] = buf1[45];
     buf0[46] = buf1[46];
@@ -2052,8 +2052,8 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
     buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
     buf1[4] = buf0[4];
-    btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6],
-                      bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
     buf1[7] = buf0[7];
     buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
     buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
@@ -2065,14 +2065,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
     buf1[16] = buf0[16];
     buf1[17] = buf0[17];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
-                      buf1[29], bit);
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
-                      buf1[28], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
-                      buf1[27], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
-                      buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
     buf1[22] = buf0[22];
     buf1[23] = buf0[23];
     buf1[24] = buf0[24];
@@ -2116,19 +2116,19 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     stage_idx++;
     bit = cos_bit[stage_idx];
     cospi = cospi_arr[bit - cos_bit_min];
-    btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1],
-                      bit);
-    btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3],
-                      bit);
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
     buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
     buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
     buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
     buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
     buf0[8] = buf1[8];
-    btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
-                      buf0[14], bit);
-    btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
     buf0[11] = buf1[11];
     buf0[12] = buf1[12];
     buf0[15] = buf1[15];
@@ -2150,26 +2150,26 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[30] = _mm_add_epi32(buf1[30], buf1[29]);
     buf0[32] = buf1[32];
     buf0[33] = buf1[33];
-    btf_32_sse2_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
-                      buf0[61], bit);
-    btf_32_sse2_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
-                      buf0[60], bit);
-    btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
-                      buf0[59], bit);
-    btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
-                      buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
     buf0[38] = buf1[38];
     buf0[39] = buf1[39];
     buf0[40] = buf1[40];
     buf0[41] = buf1[41];
-    btf_32_sse2_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
-                      buf0[53], bit);
-    btf_32_sse2_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
-                      buf0[52], bit);
-    btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
-                      buf0[51], bit);
-    btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
-                      buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
     buf0[46] = buf1[46];
     buf0[47] = buf1[47];
     buf0[48] = buf1[48];
@@ -2189,10 +2189,10 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[1] = buf0[1];
     buf1[2] = buf0[2];
     buf1[3] = buf0[3];
-    btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
-                      bit);
-    btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6],
-                      bit);
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
     buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
     buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
     buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
@@ -2202,16 +2202,16 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
     buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
     buf1[16] = buf0[16];
-    btf_32_sse2_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
-                      buf1[30], bit);
-    btf_32_sse2_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
-                      buf1[29], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
     buf1[19] = buf0[19];
     buf1[20] = buf0[20];
-    btf_32_sse2_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
-                      buf1[26], bit);
-    btf_32_sse2_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
-                      buf1[25], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
     buf1[23] = buf0[23];
     buf1[24] = buf0[24];
     buf1[27] = buf0[27];
@@ -2262,14 +2262,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[5] = buf1[5];
     buf0[6] = buf1[6];
     buf0[7] = buf1[7];
-    btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15],
-                      bit);
-    btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
-                      buf0[14], bit);
-    btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
-                      buf0[13], bit);
-    btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
-                      buf0[12], bit);
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
     buf0[16] = _mm_add_epi32(buf1[16], buf1[17]);
     buf0[17] = _mm_sub_epi32(buf1[16], buf1[17]);
     buf0[18] = _mm_sub_epi32(buf1[19], buf1[18]);
@@ -2287,28 +2287,28 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[30] = _mm_sub_epi32(buf1[31], buf1[30]);
     buf0[31] = _mm_add_epi32(buf1[31], buf1[30]);
     buf0[32] = buf1[32];
-    btf_32_sse2_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
-                      buf0[62], bit);
-    btf_32_sse2_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
-                      buf0[61], bit);
+    btf_32_sse4_1_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
     buf0[35] = buf1[35];
     buf0[36] = buf1[36];
-    btf_32_sse2_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
-                      buf0[58], bit);
-    btf_32_sse2_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
-                      buf0[57], bit);
+    btf_32_sse4_1_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
     buf0[39] = buf1[39];
     buf0[40] = buf1[40];
-    btf_32_sse2_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
-                      buf0[54], bit);
-    btf_32_sse2_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
-                      buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
     buf0[43] = buf1[43];
     buf0[44] = buf1[44];
-    btf_32_sse2_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
-                      buf0[50], bit);
-    btf_32_sse2_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
-                      buf0[49], bit);
+    btf_32_sse4_1_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
     buf0[47] = buf1[47];
     buf0[48] = buf1[48];
     buf0[51] = buf1[51];
@@ -2339,22 +2339,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf1[13] = buf0[13];
     buf1[14] = buf0[14];
     buf1[15] = buf0[15];
-    btf_32_sse2_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
-                      buf1[31], bit);
-    btf_32_sse2_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
-                      buf1[30], bit);
-    btf_32_sse2_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
-                      buf1[29], bit);
-    btf_32_sse2_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
-                      buf1[28], bit);
-    btf_32_sse2_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
-                      buf1[27], bit);
-    btf_32_sse2_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
-                      buf1[26], bit);
-    btf_32_sse2_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
-                      buf1[25], bit);
-    btf_32_sse2_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
-                      buf1[24], bit);
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
+                        buf1[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
     buf1[32] = _mm_add_epi32(buf0[32], buf0[33]);
     buf1[33] = _mm_sub_epi32(buf0[32], buf0[33]);
     buf1[34] = _mm_sub_epi32(buf0[35], buf0[34]);
@@ -2424,38 +2424,38 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
     buf0[29] = buf1[29];
     buf0[30] = buf1[30];
     buf0[31] = buf1[31];
-    btf_32_sse2_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
-                      buf0[63], bit);
-    btf_32_sse2_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
-                      buf0[62], bit);
-    btf_32_sse2_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
-                      buf0[61], bit);
-    btf_32_sse2_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
-                      buf0[60], bit);
-    btf_32_sse2_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
-                      buf0[59], bit);
-    btf_32_sse2_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
-                      buf0[58], bit);
-    btf_32_sse2_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
-                      buf0[57], bit);
-    btf_32_sse2_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
-                      buf0[56], bit);
-    btf_32_sse2_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
-                      buf0[55], bit);
-    btf_32_sse2_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
-                      buf0[54], bit);
-    btf_32_sse2_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
-                      buf0[53], bit);
-    btf_32_sse2_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
-                      buf0[52], bit);
-    btf_32_sse2_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
-                      buf0[51], bit);
-    btf_32_sse2_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
-                      buf0[50], bit);
-    btf_32_sse2_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
-                      buf0[49], bit);
-    btf_32_sse2_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
-                      buf0[48], bit);
+    btf_32_sse4_1_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
+                        buf0[63], bit);
+    btf_32_sse4_1_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
 
     // stage 11
     stage_idx++;
similarity index 68%
rename from vp10/common/x86/vp10_fwd_txfm2d_sse2.c
rename to vp10/common/x86/vp10_fwd_txfm2d_sse4.c
index 5af682fc2d1dcf4f20b4c8dcddd16c5f4845bad3..6664bd5dc0235e9b03894386d376bd395daba459 100644 (file)
@@ -1,4 +1,4 @@
-#include "vp10/common/x86/vp10_txfm1d_sse2.h"
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
 
 static inline void int16_array_with_stride_to_int32_array_without_stride(
     const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
@@ -16,31 +16,31 @@ typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
 static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   switch (txfm_type) {
     case TXFM_TYPE_DCT4:
-      return vp10_fdct4_new_sse2;
+      return vp10_fdct4_new_sse4_1;
       break;
     case TXFM_TYPE_DCT8:
-      return vp10_fdct8_new_sse2;
+      return vp10_fdct8_new_sse4_1;
       break;
     case TXFM_TYPE_DCT16:
-      return vp10_fdct16_new_sse2;
+      return vp10_fdct16_new_sse4_1;
       break;
     case TXFM_TYPE_DCT32:
-      return vp10_fdct32_new_sse2;
+      return vp10_fdct32_new_sse4_1;
       break;
     case TXFM_TYPE_DCT64:
-      return vp10_fdct64_new_sse2;
+      return vp10_fdct64_new_sse4_1;
       break;
     case TXFM_TYPE_ADST4:
-      return vp10_fadst4_new_sse2;
+      return vp10_fadst4_new_sse4_1;
       break;
     case TXFM_TYPE_ADST8:
-      return vp10_fadst8_new_sse2;
+      return vp10_fadst8_new_sse4_1;
       break;
     case TXFM_TYPE_ADST16:
-      return vp10_fadst16_new_sse2;
+      return vp10_fadst16_new_sse4_1;
       break;
     case TXFM_TYPE_ADST32:
-      return vp10_fadst32_new_sse2;
+      return vp10_fadst32_new_sse4_1;
       break;
     default:
       assert(0);
@@ -48,7 +48,7 @@ static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
   return NULL;
 }
 
-static inline void fwd_txfm2d_sse2(const int16_t *input, int32_t *output,
+static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
                                    const int stride, const TXFM_2D_CFG *cfg,
                                    int32_t *txfm_buf) {
   const int txfm_size = cfg->txfm_size;
@@ -67,51 +67,51 @@ static inline void fwd_txfm2d_sse2(const int16_t *input, int32_t *output,
 
   int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
                                                         txfm_size);
-  round_shift_array_32_sse2(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
   txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
-  round_shift_array_32_sse2(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
   transpose_32(txfm_size, out_128, buf_128);
   txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
-  round_shift_array_32_sse2(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
   transpose_32(txfm_size, buf_128, out_128);
 }
 
-void vp10_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output,
                               const int stride, const TXFM_2D_CFG *cfg,
                               const int bd) {
   int32_t txfm_buf[16];
   (void)bd;
-  fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output,
                               const int stride, const TXFM_2D_CFG *cfg,
                               const int bd) {
   int32_t txfm_buf[64];
   (void)bd;
-  fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output,
                                 const int stride, const TXFM_2D_CFG *cfg,
                                 const int bd) {
   int32_t txfm_buf[256];
   (void)bd;
-  fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
                                 const int stride, const TXFM_2D_CFG *cfg,
                                 const int bd) {
   int32_t txfm_buf[1024];
   (void)bd;
-  fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
 
-void vp10_fwd_txfm2d_64x64_sse2(const int16_t *input, int32_t *output,
+void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
                                 const int stride, const TXFM_2D_CFG *cfg,
                                 const int bd) {
   int32_t txfm_buf[4096];
   (void)bd;
-  fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf);
+  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
 }
diff --git a/vp10/common/x86/vp10_txfm1d_sse2.h b/vp10/common/x86/vp10_txfm1d_sse2.h
deleted file mode 100644 (file)
index fc25013..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-#ifndef VP10_TXMF1D_SSE2_H_
-#define VP10_TXMF1D_SSE2_H_
-
-#include <emmintrin.h>
-#include "vp10/common/vp10_txfm.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-
-void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range);
-
-void vp10_idct4_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_idct8_new_sse2(const __m128i* input, __m128i* output,
-                         const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_idct16_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_idct32_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_idct64_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-
-void vp10_iadst4_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_iadst8_new_sse2(const __m128i* input, __m128i* output,
-                          const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_iadst16_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range);
-void vp10_iadst32_new_sse2(const __m128i* input, __m128i* output,
-                           const int8_t* cos_bit, const int8_t* stage_range);
-
-static INLINE void transpose_32_4x4(int stride, const __m128i* input,
-                                    __m128i* output) {
-  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
-  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
-  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
-  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
-
-  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
-  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
-  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
-  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
-}
-
-// the entire input block can be represent by a grid of 4x4 blocks
-// each 4x4 blocks can be represent by 4 vertical __m128i
-// we first transpose each 4x4 block internally
-// than transpose the grid
-static INLINE void transpose_32(int txfm_size, const __m128i* input,
-                                __m128i* output) {
-  const int num_per_128 = 4;
-  const int row_size = txfm_size;
-  const int col_size = txfm_size / num_per_128;
-  int r, c;
-
-  // transpose each 4x4 block internally
-  for (r = 0; r < row_size; r += 4) {
-    for (c = 0; c < col_size; c++) {
-      transpose_32_4x4(col_size, &input[r * col_size + c],
-                       &output[c * 4 * col_size + r / 4]);
-    }
-  }
-}
-
-#define mullo_epi32(a, b)                                                     \
-  ({                                                                          \
-    __m128i tmp1 = _mm_mul_epu32(a, b);                                       \
-    __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); \
-    _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),      \
-                       _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));     \
-  })
-
-#define round_shift_32_sse2(vec, bit)       \
-  ({                                        \
-    __m128i tmp, round;                     \
-    round = _mm_set1_epi32(1 << (bit - 1)); \
-    tmp = _mm_add_epi32(vec, round);        \
-    _mm_srai_epi32(tmp, bit);               \
-  })
-
-#define round_shift_array_32_sse2(input, output, size, bit) \
-  ({                                                        \
-    if (bit > 0) {                                          \
-      int i;                                                \
-      for (i = 0; i < size; i++) {                          \
-        output[i] = round_shift_32_sse2(input[i], bit);     \
-      }                                                     \
-    } else {                                                \
-      int i;                                                \
-      for (i = 0; i < size; i++) {                          \
-        output[i] = _mm_slli_epi32(input[i], -bit);         \
-      }                                                     \
-    }                                                       \
-  })
-
-// out0 = in0*w0 + in1*w1
-// out1 = -in1*w0 + in0*w1
-#define btf_32_sse2_type0(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;        \
-    ww0 = _mm_set1_epi32(w0);                                \
-    ww1 = _mm_set1_epi32(w1);                                \
-    in0_w0 = mullo_epi32(in0, ww0);                          \
-    in1_w1 = mullo_epi32(in1, ww1);                          \
-    out0 = _mm_add_epi32(in0_w0, in1_w1);                    \
-    out0 = round_shift_32_sse2(out0, bit);                   \
-    in0_w1 = mullo_epi32(in0, ww1);                          \
-    in1_w0 = mullo_epi32(in1, ww0);                          \
-    out1 = _mm_sub_epi32(in0_w1, in1_w0);                    \
-    out1 = round_shift_32_sse2(out1, bit);                   \
-  })
-
-// out0 = in0*w0 + in1*w1
-// out1 = in1*w0 - in0*w1
-#define btf_32_sse2_type1(w0, w1, in0, in1, out0, out1, bit) \
-  ({                                                         \
-    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;        \
-    ww0 = _mm_set1_epi32(w0);                                \
-    ww1 = _mm_set1_epi32(w1);                                \
-    in0_w0 = mullo_epi32(in0, ww0);                          \
-    in1_w1 = mullo_epi32(in1, ww1);                          \
-    out0 = _mm_add_epi32(in0_w0, in1_w1);                    \
-    out0 = round_shift_32_sse2(out0, bit);                   \
-    in0_w1 = mullo_epi32(in0, ww1);                          \
-    in1_w0 = mullo_epi32(in1, ww0);                          \
-    out1 = _mm_sub_epi32(in1_w0, in0_w1);                    \
-    out1 = round_shift_32_sse2(out1, bit);                   \
-  })
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // VP10_TXMF1D_SSE2_H_
diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h
new file mode 100644 (file)
index 0000000..803b86d
--- /dev/null
@@ -0,0 +1,145 @@
+#ifndef VP10_TXMF1D_SSE2_H_
+#define VP10_TXMF1D_SSE2_H_
+
+#include <smmintrin.h>
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_idct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_iadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i* input,
+                                    __m128i* output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// the entire input block can be represented by a grid of 4x4 blocks
+// each 4x4 block can be represented by 4 vertical __m128i
+// we first transpose each 4x4 block internally
+// then transpose the grid
+static INLINE void transpose_32(int txfm_size, const __m128i* input,
+                                __m128i* output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
+
+#define round_shift_32_sse4_1(vec, bit)     \
+  ({                                        \
+    __m128i tmp, round;                     \
+    round = _mm_set1_epi32(1 << (bit - 1)); \
+    tmp = _mm_add_epi32(vec, round);        \
+    _mm_srai_epi32(tmp, bit);               \
+  })
+
+#define round_shift_array_32_sse4_1(input, output, size, bit) \
+  ({                                                          \
+    if (bit > 0) {                                            \
+      int i;                                                  \
+      for (i = 0; i < size; i++) {                            \
+        output[i] = round_shift_32_sse4_1(input[i], bit);     \
+      }                                                       \
+    } else {                                                  \
+      int i;                                                  \
+      for (i = 0; i < size; i++) {                            \
+        output[i] = _mm_slli_epi32(input[i], -bit);           \
+      }                                                       \
+    }                                                         \
+  })
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  ({                                                           \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  })
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  ({                                                           \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  })
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_TXMF1D_SSE2_H_
index 84eacadb28c50f9b91f7f60196c41835921524d0..40699a356a3afc37b1b7f2e1e2f4c7583f905e12 100644 (file)
@@ -110,10 +110,9 @@ VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h
-VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_txfm1d_sse2.h
-VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm1d_sse2.h
-VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm1d_sse2.c
-VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm2d_sse2.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_txfm1d_sse4.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c