#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
- const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
- const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
- return _mm_packs_epi32(t0, t1);
-}
-
-static INLINE void highbd_write_buffer_8x1(uint16_t *dest, const __m128i in,
- const int bd) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- __m128i out;
-
- out = _mm_adds_epi16(in, final_rounding);
- out = _mm_srai_epi16(out, 6);
- recon_and_store_8_kernel(out, &dest, 0, bd);
-}
-
-static INLINE void recon_and_store_4_kernel(const __m128i in,
- uint16_t *const dest,
- const int bd) {
- __m128i d;
-
- d = _mm_loadl_epi64((const __m128i *)dest);
- d = add_clamp(d, in, bd);
- _mm_storel_epi64((__m128i *)dest, d);
-}
-
-static INLINE void highbd_write_buffer_4x1(uint16_t *const dest,
- const __m128i in, const int bd) {
- const __m128i final_rounding = _mm_set1_epi32(1 << 5);
- __m128i out;
-
- out = _mm_add_epi32(in, final_rounding);
- out = _mm_srai_epi32(out, 6);
- out = _mm_packs_epi32(out, out);
- recon_and_store_4_kernel(out, dest, bd);
-}
-
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2, sign[2];
out[15] = in[15];
}
-static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
- __m128i *const out) {
- out[0] = _mm_add_epi32(in[0], in[15]);
- out[1] = _mm_add_epi32(in[1], in[14]);
- out[2] = _mm_add_epi32(in[2], in[13]);
- out[3] = _mm_add_epi32(in[3], in[12]);
- out[4] = _mm_add_epi32(in[4], in[11]);
- out[5] = _mm_add_epi32(in[5], in[10]);
- out[6] = _mm_add_epi32(in[6], in[9]);
- out[7] = _mm_add_epi32(in[7], in[8]);
- out[8] = _mm_sub_epi32(in[7], in[8]);
- out[9] = _mm_sub_epi32(in[6], in[9]);
- out[10] = _mm_sub_epi32(in[5], in[10]);
- out[11] = _mm_sub_epi32(in[4], in[11]);
- out[12] = _mm_sub_epi32(in[3], in[12]);
- out[13] = _mm_sub_epi32(in[2], in[13]);
- out[14] = _mm_sub_epi32(in[1], in[14]);
- out[15] = _mm_sub_epi32(in[0], in[15]);
-}
-
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
__m128i temp1[4], temp2, sign[2];
input += 128;
}
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(l + i * 8, out);
- transpose_16bit_8x8(r + i * 8, out + 8);
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
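
The loop rewrites in this patch are pure re-indexing: `i` now walks the
column offset itself (0, 8 for the 8-wide passes; 0, 4, 8, 12 for the
4-wide passes), so the `i * 8` / `4 * i` scaling inside the bodies goes
away. A throwaway check of the equivalence (hypothetical, offsets only):

    #include <stdio.h>

    int main(void) {
      int i;
      for (i = 0; i < 2; i++) printf("%d ", i * 8);  /* prints: 0 8 */
      for (i = 0; i < 16; i += 8) printf("%d ", i);  /* prints: 0 8 */
      return 0;
    }
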
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
- out[8] = all[2][4 * i + 0];
- out[9] = all[3][4 * i + 0];
- out[10] = all[2][4 * i + 1];
- out[11] = all[3][4 * i + 1];
- out[12] = all[2][4 * i + 2];
- out[13] = all[3][4 * i + 2];
- out[14] = all[2][4 * i + 3];
- out[15] = all[3][4 * i + 3];
+ out[8] = all[2][i + 0];
+ out[9] = all[3][i + 0];
+ out[10] = all[2][i + 1];
+ out[11] = all[3][i + 1];
+ out[12] = all[2][i + 2];
+ out[13] = all[3][i + 2];
+ out[14] = all[2][i + 3];
+ out[15] = all[3][i + 3];
transpose_32bit_8x4(out + 8, out + 8);
highbd_idct16_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
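
The gathers above pull matching vectors from the row groups all[0]/all[1]
(and all[2]/all[3]) and hand them to transpose_32bit_8x4(). For intuition,
a self-contained 4x4 sketch of a 32-bit-lane transpose; the real helpers
live in transpose_sse2.h, this only mirrors the usual unpack pattern:

    #include <emmintrin.h>

    /* Transpose a 4x4 block of 32-bit lanes; element 0 is the lowest lane. */
    static void transpose_32bit_4x4_sketch(__m128i *const a) {
      const __m128i t0 = _mm_unpacklo_epi32(a[0], a[1]); /* 00 10 01 11 */
      const __m128i t1 = _mm_unpackhi_epi32(a[0], a[1]); /* 02 12 03 13 */
      const __m128i t2 = _mm_unpacklo_epi32(a[2], a[3]); /* 20 30 21 31 */
      const __m128i t3 = _mm_unpackhi_epi32(a[2], a[3]); /* 22 32 23 33 */
      a[0] = _mm_unpacklo_epi64(t0, t2); /* 00 10 20 30 */
      a[1] = _mm_unpackhi_epi64(t0, t2); /* 01 11 21 31 */
      a[2] = _mm_unpacklo_epi64(t1, t3); /* 02 12 22 32 */
      a[3] = _mm_unpackhi_epi64(t1, t3); /* 03 13 23 33 */
    }
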
in[15] = _mm_setzero_si128();
idct16_8col(in);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(in + i * 8, out);
+ transpose_16bit_8x8(in + i, out);
out[8] = _mm_setzero_si128();
out[9] = _mm_setzero_si128();
out[10] = _mm_setzero_si128();
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
highbd_idct16x16_38_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
idct16x16_10_pass1(in, l);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- idct16x16_10_pass2(l + 8 * i, in);
+ idct16x16_10_pass2(l + i, in);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, in[j], bd);
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
}
dest += 8;
}
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- transpose_32bit_4x4(&all[0][4 * i], out);
+ transpose_32bit_4x4(&all[0][i], out);
highbd_idct16x16_10_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
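
The stage-7 butterfly above (moved, not changed) is the usual mirror
add/sub; per 32-bit lane it computes the following (scalar sketch for
intuition, not a replacement):

    #include <stdint.h>

    /* Scalar view of highbd_idct16_4col_stage7, one lane at a time. */
    static void idct16_stage7_scalar(const int32_t *const in,
                                     int32_t *const out) {
      int k;
      for (k = 0; k < 8; ++k) {
        out[k] = in[k] + in[15 - k];
        out[15 - k] = in[k] - in[15 - k];
      }
    }
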
+
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
const __m128i zero = _mm_set1_epi16(0);
}
}
-static INLINE void recon_and_store_4_dual(const __m128i in,
- uint16_t *const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
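
recon_and_store_4() (renamed from recon_and_store_4_kernel) adds a row of
four 16-bit residuals to the destination pixels through add_clamp().
Scalar equivalent, assuming add_clamp() clamps to [0, (1 << bd) - 1]:

    #include <stdint.h>

    /* Hypothetical scalar counterpart of recon_and_store_4. */
    static void recon_and_store_4_scalar(const int16_t *const in,
                                         uint16_t *const dest, const int bd) {
      const int max = (1 << bd) - 1;
      int k;
      for (k = 0; k < 4; ++k) {
        const int v = dest[k] + in[k];  /* reconstruct */
        dest[k] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);
      }
    }
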
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
__m128i d;
d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
_mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}
-static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_4_dual(in[0], dest, stride, bd);
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
dest += 2 * stride;
- recon_and_store_4_dual(in[1], dest, stride, bd);
+ recon_and_store_4x2(in[1], dest, stride, bd);
}
-static INLINE void recon_and_store_8_kernel(const __m128i in,
- uint16_t **const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
__m128i d;
d = _mm_load_si128((const __m128i *)(*dest));
*dest += stride;
}
-static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_8_kernel(in[0], &dest, stride, bd);
- recon_and_store_8_kernel(in[1], &dest, stride, bd);
- recon_and_store_8_kernel(in[2], &dest, stride, bd);
- recon_and_store_8_kernel(in[3], &dest, stride, bd);
- recon_and_store_8_kernel(in[4], &dest, stride, bd);
- recon_and_store_8_kernel(in[5], &dest, stride, bd);
- recon_and_store_8_kernel(in[6], &dest, stride, bd);
- recon_and_store_8_kernel(in[7], &dest, stride, bd);
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
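
One detail worth noting on the write-back path: recon_and_store_8() takes
uint16_t **dest and advances *dest by stride after each row, which is why
highbd_write_buffer_8() can pass a stride of 0 to emit a single row. The
write-buffer helpers implement the standard final rounding: the idct16
output is in Q6 (scaled by 64), so they add 1 << 5 and shift right by 6
to round to nearest before reconstructing. Scalar sketch of one row
(hypothetical helper, same clamp assumption as above):

    #include <stdint.h>

    static void highbd_write_row_scalar(uint16_t *const dest,
                                        const int32_t *const in, const int n,
                                        const int bd) {
      const int max = (1 << bd) - 1;
      int k;
      for (k = 0; k < n; ++k) {
        const int res = (in[k] + 32) >> 6;  /* round Q6 -> integer */
        const int v = dest[k] + res;        /* reconstruct */
        dest[k] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);
      }
    }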