Update for-loop increment of idct x86 functions

author Linfeng Zhang <linfengz@google.com>

Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)

committer Linfeng Zhang <linfengz@google.com>

Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)
author Linfeng Zhang <linfengz@google.com>
Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)
committer Linfeng Zhang <linfengz@google.com>
Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)
diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h

index 16f03cd142d69802040b26a322cdf0d5f9de07d7..ca771b5f70e49af18f09e3091daced06cc7d15c3 100644 (file)
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -233,10 +233,10 @@ static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
    dc = _mm_set1_epi16(a1);
  
    for (i = 0; i < size; ++i) {
-    for (j = 0; j < (size >> 3); ++j) {
-      d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
+    for (j = 0; j < size; j += 8) {
+      d = _mm_load_si128((const __m128i *)(&dest[j]));
        d = add_clamp(d, dc, bd);
-      _mm_store_si128((__m128i *)(&dest[j * 8]), d);
+      _mm_store_si128((__m128i *)(&dest[j]), d);
      }
      dest += stride;
    }
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c

index f730030c50c6c72f357de34213a225f45dcdc92a..1df1c9d73b3e6164fd5738e17bc687a718069465 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -458,10 +458,10 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
      input += 128;
    }
  
-  for (i = 0; i < 2; i++) {
+  for (i = 0; i < 16; i += 8) {
      int j;
-    transpose_16bit_8x8(l + i * 8, out);
-    transpose_16bit_8x8(r + i * 8, out + 8);
+    transpose_16bit_8x8(l + i, out);
+    transpose_16bit_8x8(r + i, out + 8);
      idct16_8col(out);
  
      for (j = 0; j < 16; ++j) {
@@ -489,9 +489,9 @@ void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
    in[15] = _mm_setzero_si128();
    idct16_8col(in);
  
-  for (i = 0; i < 2; i++) {
+  for (i = 0; i < 16; i += 8) {
      int j;
-    transpose_16bit_8x8(in + i * 8, out);
+    transpose_16bit_8x8(in + i, out);
      out[8] = _mm_setzero_si128();
      out[9] = _mm_setzero_si128();
      out[10] = _mm_setzero_si128();
@@ -525,9 +525,9 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
    idct16x16_10_pass1(in, l);
  
    // Second 1-D inverse transform, performed per 8x16 block
-  for (i = 0; i < 2; i++) {
+  for (i = 0; i < 16; i += 8) {
      int j;
-    idct16x16_10_pass2(l + 8 * i, in);
+    idct16x16_10_pass2(l + i, in);
  
      for (j = 0; j < 16; ++j) {
        write_buffer_8x1(dest + j * stride, in[j]);
@@ -1268,10 +1268,10 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
    col[29] = _mm_sub_epi16(stp1_2, stp1_29);
    col[30] = _mm_sub_epi16(stp1_1, stp1_30);
    col[31] = _mm_sub_epi16(stp1_0, stp1_31);
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < 32; i += 8) {
      int j;
      // Transpose 32x8 block to 8x32 block
-    transpose_16bit_8x8(col + i * 8, in);
+    transpose_16bit_8x8(col + i, in);
      IDCT32_34
  
      // 2_D: Calculate the results and store them to destination.
@@ -1588,10 +1588,10 @@ static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
  void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                   int stride) {
    __m128i col[128], in[32];
-  int i, j;
+  int i;
  
    // rows
-  for (i = 0; i < 4; ++i) {
+  for (i = 0; i < 4 * 32; i += 32) {
      load_buffer_8x32(input, in);
      input += 32 << 3;
  
@@ -1601,17 +1601,16 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
      transpose_16bit_8x8(in + 16, in + 16);
      transpose_16bit_8x8(in + 24, in + 24);
  
-    idct32_full_8x32(in, col + (i << 5));
+    idct32_full_8x32(in, col + i);
    }
  
    // columns
-  for (i = 0; i < 4; ++i) {
-    j = i << 3;
+  for (i = 0; i < 32; i += 8) {
      // Transpose 32x8 block to 8x32 block
-    transpose_16bit_8x8(col + j, in);
-    transpose_16bit_8x8(col + j + 32, in + 8);
-    transpose_16bit_8x8(col + j + 64, in + 16);
-    transpose_16bit_8x8(col + j + 96, in + 24);
+    transpose_16bit_8x8(col + i, in);
+    transpose_16bit_8x8(col + i + 32, in + 8);
+    transpose_16bit_8x8(col + i + 64, in + 16);
+    transpose_16bit_8x8(col + i + 96, in + 24);
  
      idct32_full_8x32(in, in);
      store_buffer_8x32(in, dest, stride);
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c

index 0d8d60f4759d7301eb12f58cc4b5e09a514fc899..f9b37feaa828ccab0b48099f3743eefcc0c48360 100644 (file)
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -236,10 +236,10 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
  
    // 1_D: Store 32 intermediate results for each 8x32 block.
    add_sub_butterfly(stp1, col, 32);
-  for (i = 0; i < 4; i++) {
+  for (i = 0; i < 32; i += 8) {
      int j;
      // Transpose 32x8 block to 8x32 block
-    transpose_16bit_8x8(col + i * 8, in);
+    transpose_16bit_8x8(col + i, in);
      idct32_34_first_half(in, stp1);
      idct32_34_second_half(in, stp1);
author	Linfeng Zhang <linfengz@google.com>
	Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)
committer	Linfeng Zhang <linfengz@google.com>
	Fri, 4 Aug 2017 22:29:19 +0000 (15:29 -0700)
vpx_dsp/x86/highbd_inv_txfm_sse2.h		patch \| blob \| history
vpx_dsp/x86/inv_txfm_sse2.c		patch \| blob \| history
vpx_dsp/x86/inv_txfm_ssse3.c		patch \| blob \| history