From: Yi Luo
Date: Wed, 24 Feb 2016 00:59:38 +0000 (-0800)
Subject: Implemented DST 8x8 with SSE2 intrinsics.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0353f596e9b0763b621d9d15761365f4816f8761;p=libvpx

Implemented DST 8x8 with SSE2 intrinsics.

Implemented fdst8_sse2() against the C version fdst8().
Added the seven DST-related hybrid transform types to vp10_fht8x8_sse2().
Replaced vp10_fht8x8_c() with the RTCD-dispatched vp10_fht8x8() in
fwd_txfm_8x8(), which resolves to vp10_fht8x8_sse2() on SSE2-capable
targets.
Speedup: 18.1%, 11.5%, and 22.0% in speed tests on city_cif.y4m,
garden_sif.y4m, and mobile_cif.y4m, respectively.

Change-Id: Ia4aa1ea44c7a33e494f64ce843037f8703f975e3
---

diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index 6507f98f3..83b5df4d6 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -107,8 +107,6 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
     case DST_DST:
     case DCT_DST:
     case DST_DCT:
@@ -116,8 +114,7 @@ static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
     case ADST_DST:
     case DST_FLIPADST:
    case FLIPADST_DST:
-      // Use C version since DST exists only in C
-      vp10_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
       break;
     case IDTX:
       fwd_idtx_c(src_diff, coeff, diff_stride, 8);
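
For context on why the single vp10_fht8x8() call now covers all of these
case labels: vp10_fht8x8 (no suffix) is libvpx's RTCD (run-time CPU
detection) entry point, which resolves to vp10_fht8x8_sse2() when SSE2 is
present and to vp10_fht8x8_c() otherwise. That is also why the first hunk
can simply delete the dedicated ADST/FLIPADST arm and let those labels
fall through to the shared call. A rough sketch of the generated
vp10_rtcd.h wiring (schematic only, not the verbatim generated header;
RTCD_EXTERN, HAS_SSE2, and x86_simd_caps() are the names libvpx uses on
x86):

/* Schematic sketch of the generated vp10_rtcd.h dispatch; not verbatim. */
void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
                   int stride, int tx_type);
void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type);
RTCD_EXTERN void (*vp10_fht8x8)(const int16_t *input, tran_low_t *output,
                                int stride, int tx_type);

static void setup_rtcd_internal(void) {
  int flags = x86_simd_caps();
  vp10_fht8x8 = vp10_fht8x8_c;  /* portable fallback */
  if (flags & HAS_SSE2) vp10_fht8x8 = vp10_fht8x8_sse2;
}
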
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
index 1cba80372..79d1e889a 100644
--- a/vp10/encoder/x86/dct_sse2.c
+++ b/vp10/encoder/x86/dct_sse2.c
@@ -1288,6 +1288,155 @@ static void fadst8_sse2(__m128i *in) {
   array_transpose_8x8(in, in);
 }
 
+#if CONFIG_EXT_TX
+static void fdst8_sse2(__m128i *in) {
+  // Constants
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
+  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
+
+  s0 = _mm_sub_epi16(in[0], in[7]);
+  s1 = _mm_sub_epi16(in[1], in[6]);  // -s1
+  s2 = _mm_sub_epi16(in[2], in[5]);
+  s3 = _mm_sub_epi16(in[3], in[4]);  // -s3
+  s4 = _mm_add_epi16(in[3], in[4]);  // -s4
+  s5 = _mm_add_epi16(in[2], in[5]);
+  s6 = _mm_add_epi16(in[1], in[6]);  // -s6
+  s7 = _mm_add_epi16(in[0], in[7]);
+
+  x0 = _mm_sub_epi16(s0, s3);
+  x1 = _mm_sub_epi16(s1, s2);  // -x1
+  x2 = _mm_add_epi16(s1, s2);  // -x2
+  x3 = _mm_add_epi16(s0, s3);
+
+  // Interleave
+  t0 = _mm_unpacklo_epi16(x0, x1);
+  t1 = _mm_unpackhi_epi16(x0, x1);
+  t2 = _mm_unpacklo_epi16(x2, x3);
+  t3 = _mm_unpackhi_epi16(x2, x3);
+
+  // Perform butterfly multiplication/addition
+  x0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+  x1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+  x2 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+  x3 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+  x4 = _mm_madd_epi16(t2, k__cospi_m24_p08);
+  x5 = _mm_madd_epi16(t3, k__cospi_m24_p08);
+  x6 = _mm_madd_epi16(t2, k__cospi_p08_p24);
+  x7 = _mm_madd_epi16(t3, k__cospi_p08_p24);
+
+  // Rounding
+  t0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  t1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  t2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  t3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+  t4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+  t5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+  t6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+  t7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+  // Shift
+  x0 = _mm_srai_epi32(t0, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(t1, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(t2, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(t3, DCT_CONST_BITS);
+  x4 = _mm_srai_epi32(t4, DCT_CONST_BITS);
+  x5 = _mm_srai_epi32(t5, DCT_CONST_BITS);
+  x6 = _mm_srai_epi32(t6, DCT_CONST_BITS);
+  x7 = _mm_srai_epi32(t7, DCT_CONST_BITS);
+
+  // Pack 32b integer to 16b with signed saturation
+  in[7] = _mm_packs_epi32(x0, x1);
+  in[5] = _mm_packs_epi32(x4, x5);
+  in[3] = _mm_packs_epi32(x2, x3);
+  in[1] = _mm_packs_epi32(x6, x7);
+
+  // Interleave
+  s0 = _mm_unpacklo_epi16(s6, s5);
+  s1 = _mm_unpackhi_epi16(s6, s5);
+
+  // Perform butterfly multiplication/addition
+  x0 = _mm_madd_epi16(s0, k__cospi_m16_m16);
+  x1 = _mm_madd_epi16(s1, k__cospi_m16_m16);
+  x2 = _mm_madd_epi16(s0, k__cospi_m16_p16);
+  x3 = _mm_madd_epi16(s1, k__cospi_m16_p16);
+
+  // Rounding
+  t0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  t1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  t2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  t3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+
+  // Shift
+  x0 = _mm_srai_epi32(t0, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(t1, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(t2, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(t3, DCT_CONST_BITS);
+
+  // Pack 32b integer to 16b with signed saturation
+  t2 = _mm_packs_epi32(x0, x1);
+  t3 = _mm_packs_epi32(x2, x3);
+
+  x0 = _mm_sub_epi16(t2, s4);
+  x1 = _mm_add_epi16(t2, s4);  // -x1
+  x2 = _mm_sub_epi16(s7, t3);
+  x3 = _mm_add_epi16(s7, t3);
+
+  s0 = _mm_unpacklo_epi16(x0, x3);
+  s1 = _mm_unpackhi_epi16(x0, x3);
+  s2 = _mm_unpacklo_epi16(x1, x2);
+  s3 = _mm_unpackhi_epi16(x1, x2);
+
+  t0 = _mm_madd_epi16(s0, k__cospi_p28_p04);
+  t1 = _mm_madd_epi16(s1, k__cospi_p28_p04);
+  t2 = _mm_madd_epi16(s2, k__cospi_m12_p20);
+  t3 = _mm_madd_epi16(s3, k__cospi_m12_p20);
+  t4 = _mm_madd_epi16(s2, k__cospi_p20_p12);
+  t5 = _mm_madd_epi16(s3, k__cospi_p20_p12);
+  t6 = _mm_madd_epi16(s0, k__cospi_m04_p28);
+  t7 = _mm_madd_epi16(s1, k__cospi_m04_p28);
+
+  // Rounding
+  x0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING);
+  x1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING);
+  x2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING);
+  x3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING);
+  x4 = _mm_add_epi32(t4, k__DCT_CONST_ROUNDING);
+  x5 = _mm_add_epi32(t5, k__DCT_CONST_ROUNDING);
+  x6 = _mm_add_epi32(t6, k__DCT_CONST_ROUNDING);
+  x7 = _mm_add_epi32(t7, k__DCT_CONST_ROUNDING);
+  // Shift
+  s0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+  s1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+  s2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+  s3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+  s4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+  s5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+  s6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+  s7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+  in[6] = _mm_packs_epi32(s0, s1);
+  in[4] = _mm_packs_epi32(s4, s5);
+  in[2] = _mm_packs_epi32(s2, s3);
+  in[0] = _mm_packs_epi32(s6, s7);
+
+  // coeffs: [x3 x2 x1 x0, x7 x6 x5 x4]
+  // Transpose
+  array_transpose_8x8(in, in);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                       int stride, int tx_type) {
   __m128i in[8];
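
Each interleave/madd/round/shift/pack run inside fdst8_sse2() above is the
usual libvpx fixed-point butterfly, vectorized across the eight columns:
_mm_unpacklo/hi_epi16 pairs two rows lane by lane, _mm_madd_epi16 then
yields a * c0 + b * c1 as a 32-bit value per column, and the add/srai pair
applies DCT_CONST_BITS rounding. A scalar sketch of what one stage computes
per column (butterfly_one is a hypothetical name for illustration; the
constants match vpx_dsp/txfm_common.h):

#include <stdint.h>

#define DCT_CONST_BITS 14  /* as in vpx_dsp/txfm_common.h */

/* Scalar view of one SSE2 butterfly stage (illustrative sketch only).
 * fdst8_sse2() produces this value for eight columns at once, then
 * saturates the 32-bit results back to 16 bits with _mm_packs_epi32. */
static int32_t butterfly_one(int32_t a, int32_t b, int32_t c0, int32_t c1) {
  const int32_t rounding = 1 << (DCT_CONST_BITS - 1);  /* DCT_CONST_ROUNDING */
  return (a * c0 + b * c1 + rounding) >> DCT_CONST_BITS;
}
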
@@ -1353,6 +1502,55 @@ void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
+    case DST_DST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdst8_sse2(in);
+      fdst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_DST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdct8_sse2(in);
+      fdst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DST_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdst8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DST_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case ADST_DST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fdst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fdst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_DST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fdst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
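
Since the commit validates fdst8_sse2() against the C fdst8(), the natural
way to reproduce that validation is a bit-exactness check over the full
8x8 path for the seven new hybrid types. A minimal sketch follows (not
part of the commit; the header paths, the TX_TYPE constants, and a
CONFIG_EXT_TX build of vp10 are assumed):

/* Bit-exactness sketch for the new DST hybrid types (illustrative only;
 * header paths and build setup assumed from the vp10 CONFIG_EXT_TX tree). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "./vp10_rtcd.h"
#include "vp10/common/enums.h"       /* TX_TYPE: DST_DST, ..., FLIPADST_DST */
#include "vpx_dsp/vpx_dsp_common.h"  /* tran_low_t */

int main(void) {
  /* Same seven labels as the switch above; avoids assuming the TX_TYPE
   * enum values are contiguous. */
  static const int dst_types[] = { DST_DST, DCT_DST, DST_DCT, DST_ADST,
                                   ADST_DST, DST_FLIPADST, FLIPADST_DST };
  int16_t src_diff[64];
  tran_low_t out_c[64], out_sse2[64];
  int n, i, t;
  srand(0);
  for (n = 0; n < 10000; ++n) {
    for (i = 0; i < 64; ++i)
      src_diff[i] = (rand() % 512) - 256;  /* 9-bit residual range */
    for (t = 0; t < 7; ++t) {
      vp10_fht8x8_c(src_diff, out_c, 8 /* stride */, dst_types[t]);
      vp10_fht8x8_sse2(src_diff, out_sse2, 8, dst_types[t]);
      if (memcmp(out_c, out_sse2, sizeof(out_c)) != 0) {
        printf("mismatch at iter %d, tx_type %d\n", n, dst_types[t]);
        return 1;
      }
    }
  }
  printf("all DST hybrid types match\n");
  return 0;
}

The quoted 18.1%/11.5%/22.0% speedups are per-clip speed-test results, so
the exact figures will vary with content and the mix of transform types
selected by the encoder.
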