From 11ca81f8b64fe5af24a800ba2cfb0f0d37d56ed5 Mon Sep 17 00:00:00 2001
From: Yunqing Wang <yunqingwang@google.com>
Date: Fri, 8 Mar 2013 10:54:30 -0800
Subject: [PATCH] Add vp9_idct4_1d_sse2

Add an SSE2 version of the 1-D 4-point inverse DCT (vp9_idct4_1d), exposed
through RTCD and used by vp9_short_iht4x4. Also change the tx_type parameter
of the vp9_short_iht* functions from TX_TYPE to int to match the RTCD prototypes.

Change-Id: I81ba7cb4db6738f1923383b52a06deb760923ffe
---
 vp9/common/vp9_idctllm.c         | 34 ++++++++++++------------
 vp9/common/vp9_rtcd_defs.sh      |  3 +++
 vp9/common/x86/vp9_idctllm_x86.c | 44 +++++++++++++++++++++++++++++---
 3 files changed, 60 insertions(+), 21 deletions(-)

diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 54b79ee64..e2106250f 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -26,6 +26,7 @@
 #include <math.h>
 
 #include "./vpx_config.h"
+#include "./vp9_rtcd.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@@ -109,7 +110,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
   }
 }
 
-static void idct4_1d(int16_t *input, int16_t *output) {
+void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
   int16_t step[4];
   int temp1, temp2;
   // stage 1
@@ -140,7 +141,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = input[j];
-    idct4_1d(temp_in, outptr);
+    vp9_idct4_1d(temp_in, outptr);
     input += 4;
     outptr += 4;
   }
@@ -149,7 +150,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    idct4_1d(temp_in, temp_out);
+    vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
@@ -205,7 +206,7 @@ static void idct8_1d(int16_t *input, int16_t *output) {
   step1[6] = dct_const_round_shift(temp2);
 
   // stage 2 & stage 3 - even half
-  idct4_1d(step1, step1);
+  vp9_idct4_1d(step1, step1);
 
   // stage 2 - odd half
   step2[4] = step1[4] + step1[5];
@@ -298,24 +299,23 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(s3);
 }
 
-static const transform_2d IHT_4[] = {
-  { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0
-  { iadst4_1d, idct4_1d  },  // ADST_DCT = 1
-  { idct4_1d,  iadst4_1d },  // DCT_ADST = 2
-  { iadst4_1d, iadst4_1d }   // ADST_ADST = 3
-};
-
 void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, TX_TYPE tx_type) {
+                        int pitch, int tx_type) {
+  const transform_2d IHT_4[] = {
+    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
+    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
+    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2
+    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3
+  };
+
   int i, j;
   int16_t out[4 * 4];
   int16_t *outptr = out;
   int16_t temp_in[4], temp_out[4];
-  const transform_2d ht = IHT_4[tx_type];
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    ht.rows(input, outptr);
+    IHT_4[tx_type].rows(input, outptr);
     input  += 4;
     outptr += 4;
   }
@@ -324,7 +324,7 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
   for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j)
       temp_in[j] = out[j * 4 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
       output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
   }
@@ -415,7 +415,7 @@ static const transform_2d IHT_8[] = {
 };
 
 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
-                        int pitch, TX_TYPE tx_type) {
+                        int pitch, int tx_type) {
   int i, j;
   int16_t out[8 * 8];
   int16_t *outptr = out;
@@ -838,7 +838,7 @@ static const transform_2d IHT_16[] = {
 };
 
 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, TX_TYPE tx_type) {
+                          int pitch, int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 48ae860a9..04b67b925 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -322,6 +322,9 @@ specialize vp9_short_iht4x4
 prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16
 
+prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
+specialize vp9_idct4_1d sse2
+
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
index 7b3c57967..3d7a1481c 100644
--- a/vp9/common/x86/vp9_idctllm_x86.c
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -77,10 +77,10 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
 void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
-  const __m128i cst = _mm_setr_epi16((short)cospi_16_64, (short)cospi_16_64,
-                                     (short)cospi_16_64, (short)-cospi_16_64,
-                                     (short)cospi_24_64, (short)-cospi_8_64,
-                                     (short)cospi_8_64, (short)cospi_24_64);
+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
@@ -198,4 +198,40 @@ void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {
   input3 = _mm_srli_si128(input3, 8);
   _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
 }
+
+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
+
+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  __m128i in, temp;
+
+  // Load the four 16-bit inputs i0..i3 into the low 64 bits; high half is 0.
+  in = _mm_loadl_epi64((__m128i *)input);
+
+  // Lanes, high to low: i3, i1, i3, i1, i2, i0, i2, i0 (pairs for the madd).
+  in = _mm_shufflelo_epi16(in, 0xd8);
+  in = _mm_unpacklo_epi32(in, in);
+
+  // Stage 1: multiply-add by c1, then round and shift by DCT_CONST_BITS.
+  in = _mm_madd_epi16(in, c1);
+  in = _mm_add_epi32(in, rounding);
+  in = _mm_srai_epi32(in, DCT_CONST_BITS);
+  in = _mm_packs_epi32(in, zero);  // s0..s3 as int16 (saturating pack)
+
+  // Stage 2: butterfly via madd with +/-1: s0+s3, s1+s2, s1-s2, s0-s3.
+  temp = _mm_shufflelo_epi16(in, 0x9c);
+  in = _mm_shufflelo_epi16(in, 0xc9);
+  in = _mm_unpacklo_epi64(temp, in);
+  in = _mm_madd_epi16(in, c2);
+  in = _mm_packs_epi32(in, zero);
+
+  // Store the four 16-bit results (low 64 bits) to output.
+  _mm_storel_epi64((__m128i *)output, in);
+}
+
 #endif
-- 
2.50.1