From e4985cf619a8071ddc9a1fc9a0b96e8fe30b9d66 Mon Sep 17 00:00:00 2001
From: Linfeng Zhang <linfengz@google.com>
Date: Mon, 30 Jan 2017 12:25:58 -0800
Subject: [PATCH] Update 16x16 8-bit idct NEON intrinsics

Remove redundant memory accesses.

Change-Id: I8049074bdba5f49eab7e735b2b377423a69cd4c8
---
 vpx_dsp/arm/idct16x16_add_neon.c | 184 +++++++++++++------------------
 1 file changed, 77 insertions(+), 107 deletions(-)

diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c
index 0c891919b..e28587207 100644
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -14,54 +14,6 @@
 #include "vpx_dsp/arm/idct_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void idct16x16_256_add_load_tran_low_kernel(
-    const tran_low_t **input, int16_t **out) {
-  int16x8_t s;
-
-  s = load_tran_low_to_s16q(*input);
-  vst1q_s16(*out, s);
-  *input += 8;
-  *out += 8;
-}
-
-static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input,
-                                                   int16_t *out) {
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-  idct16x16_256_add_load_tran_low_kernel(&input, &out);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                 int16x4_t *const d1) {
   *d0 = vrshrn_n_s32(t32[0], 14);
@@ -71,7 +23,7 @@ static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
 static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_2_30_10_22,
                                    int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
@@ -87,7 +39,7 @@ static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
 static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_4_12_20N_28,
                                    int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
@@ -103,7 +55,7 @@ static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
 static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
                                    const int16x4_t cospi_6_26_14_18N,
                                    int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
@@ -149,7 +101,7 @@ static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
 static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_2_30_10_22,
                                     int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
@@ -165,7 +117,7 @@ static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
 static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_4_12_20N_28,
                                     int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
@@ -181,7 +133,7 @@ static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
 static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
                                     const int16x4_t cospi_6_26_14_18N,
                                     int16x8_t *const d0, int16x8_t *const d1) {
-  int32x4_t t32[6];
+  int32x4_t t32[4];
 
   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
@@ -206,7 +158,7 @@ static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
   wrap_low_4x2(t32, d0, d1);
 }
 
-static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
+static void idct16x16_256_add_half1d(const void *const input, int16_t *output,
                                      uint8_t *dest, int stride) {
   const int16x8_t cospis0 = vld1q_s16(kCospi);
   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
@@ -217,37 +169,73 @@ static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
   int16x8_t in[16], step1[16], step2[16], out[16];
 
   // Load input (16x8)
-  in[0] = vld1q_s16(input);
-  input += 8;
-  in[8] = vld1q_s16(input);
-  input += 8;
-  in[1] = vld1q_s16(input);
-  input += 8;
-  in[9] = vld1q_s16(input);
-  input += 8;
-  in[2] = vld1q_s16(input);
-  input += 8;
-  in[10] = vld1q_s16(input);
-  input += 8;
-  in[3] = vld1q_s16(input);
-  input += 8;
-  in[11] = vld1q_s16(input);
-  input += 8;
-  in[4] = vld1q_s16(input);
-  input += 8;
-  in[12] = vld1q_s16(input);
-  input += 8;
-  in[5] = vld1q_s16(input);
-  input += 8;
-  in[13] = vld1q_s16(input);
-  input += 8;
-  in[6] = vld1q_s16(input);
-  input += 8;
-  in[14] = vld1q_s16(input);
-  input += 8;
-  in[7] = vld1q_s16(input);
-  input += 8;
-  in[15] = vld1q_s16(input);
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
 
   // Transpose
   transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
@@ -442,8 +430,7 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
   const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
   int16x4_t in[4], step1[16], step2[16], out[16];
 
-// Load input (4x4)
-#if CONFIG_VP9_HIGHBITDEPTH
+  // Load input (4x4)
   in[0] = load_tran_low_to_s16d(input);
   input += 16;
   in[1] = load_tran_low_to_s16d(input);
@@ -451,15 +438,6 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
   in[2] = load_tran_low_to_s16d(input);
   input += 16;
   in[3] = load_tran_low_to_s16d(input);
-#else
-  in[0] = vld1_s16(input);
-  input += 16;
-  in[1] = vld1_s16(input);
-  input += 16;
-  in[2] = vld1_s16(input);
-  input += 16;
-  in[3] = vld1_s16(input);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Transpose
   transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
@@ -781,20 +759,12 @@ void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
   int16_t row_idct_output[16 * 16];
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  int16_t pass1_input[16 * 16];
-  idct16x16_256_add_load_tran_low(input, pass1_input);
-#else
-  const int16_t *pass1_input = input;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
   // pass 1
   // Parallel idct on the upper 8 rows
-  idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride);
+  idct16x16_256_add_half1d(input, row_idct_output, dest, stride);
 
   // Parallel idct on the lower 8 rows
-  idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest,
-                           stride);
+  idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride);
 
   // pass 2
   // Parallel idct to get the left 8 columns
-- 
2.50.1