granicus.if.org Git - libvpx/blob - vpx_dsp/arm/idct16x16_add_neon.c

   1 /*
   2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <arm_neon.h>
  12
  13 #include "./vpx_dsp_rtcd.h"
  14 #include "vpx_dsp/arm/idct_neon.h"
  15 #include "vpx_dsp/txfm_common.h"
  16
  17 #if CONFIG_VP9_HIGHBITDEPTH
  18 static INLINE void idct16x16_256_add_load_tran_low_kernel(
  19     const tran_low_t **input, int16_t **out) {
  20   int16x8_t s;
  21
  22   s = load_tran_low_to_s16q(*input);
  23   vst1q_s16(*out, s);
  24   *input += 8;
  25   *out += 8;
  26 }
  27
  28 static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input,
  29                                                    int16_t *out) {
  30   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  31   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  32   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  33   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  34   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  35   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  36   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  37   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  38   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  39   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  40   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  41   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  42   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  43   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  44   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  45   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  46   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  47   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  48   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  49   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  50   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  51   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  52   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  53   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  54   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  55   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  56   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  57   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  58   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  59   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  60   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  61   idct16x16_256_add_load_tran_low_kernel(&input, &out);
  62 }
  63 #endif  // CONFIG_VP9_HIGHBITDEPTH
  64
  65 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
  66                                 int16x4_t *const d1) {
  67   *d0 = vrshrn_n_s32(t32[0], 14);
  68   *d1 = vrshrn_n_s32(t32[1], 14);
  69 }
  70
  71 static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
  72                                    const int16x4_t cospi_2_30_10_22,
  73                                    int16x8_t *const d0, int16x8_t *const d1) {
  74   int32x4_t t32[6];
  75
  76   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  77   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  78   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  79   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  80   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  81   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  82   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  83   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  84   idct16x16_add_wrap_low_8x2(t32, d0, d1);
  85 }
  86
  87 static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
  88                                    const int16x4_t cospi_4_12_20N_28,
  89                                    int16x8_t *const d0, int16x8_t *const d1) {
  90   int32x4_t t32[6];
  91
  92   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  93   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  94   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  95   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  96   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  97   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  98   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  99   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
 100   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 101 }
 102
 103 static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
 104                                    const int16x4_t cospi_6_26_14_18N,
 105                                    int16x8_t *const d0, int16x8_t *const d1) {
 106   int32x4_t t32[6];
 107
 108   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
 109   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
 110   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
 111   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
 112   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
 113   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
 114   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
 115   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
 116   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 117 }
 118
 119 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
 120                                             const int16x4_t s1,
 121                                             const int16x4_t cospi_0_8_16_24,
 122                                             int32x4_t *const t32) {
 123   t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
 124   t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
 125   t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
 126   t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
 127 }
 128
 129 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
 130                                      const int16x4_t cospi_0_8_16_24,
 131                                      int16x4_t *const d0, int16x4_t *const d1) {
 132   int32x4_t t32[2];
 133
 134   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
 135   wrap_low_4x2(t32, d0, d1);
 136 }
 137
 138 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
 139                                          const int16x4_t cospi_0_8_16_24,
 140                                          int16x4_t *const d0,
 141                                          int16x4_t *const d1) {
 142   int32x4_t t32[2];
 143
 144   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
 145   t32[1] = vnegq_s32(t32[1]);
 146   wrap_low_4x2(t32, d0, d1);
 147 }
 148
 149 static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
 150                                     const int16x4_t cospi_2_30_10_22,
 151                                     int16x8_t *const d0, int16x8_t *const d1) {
 152   int32x4_t t32[6];
 153
 154   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
 155   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
 156   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
 157   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
 158   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
 159   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
 160   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
 161   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
 162   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 163 }
 164
 165 static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
 166                                     const int16x4_t cospi_4_12_20N_28,
 167                                     int16x8_t *const d0, int16x8_t *const d1) {
 168   int32x4_t t32[6];
 169
 170   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
 171   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
 172   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
 173   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
 174   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
 175   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
 176   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
 177   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
 178   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 179 }
 180
 181 static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
 182                                     const int16x4_t cospi_6_26_14_18N,
 183                                     int16x8_t *const d0, int16x8_t *const d1) {
 184   int32x4_t t32[6];
 185
 186   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
 187   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
 188   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
 189   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
 190   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
 191   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
 192   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
 193   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
 194   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 195 }
 196
 197 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
 198                                       const int16x4_t cospi_0_8_16_24,
 199                                       int16x4_t *const d0,
 200                                       int16x4_t *const d1) {
 201   int32x4_t t32[3];
 202
 203   t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
 204   t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
 205   t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
 206   wrap_low_4x2(t32, d0, d1);
 207 }
 208
 209 static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
 210                                      uint8_t *dest, int stride) {
 211   const int16x8_t cospis0 = vld1q_s16(kCospi);
 212   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 213   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 214   const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
 215   const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
 216   const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1);
 217   int16x8_t in[16], step1[16], step2[16], out[16];
 218
 219   // Load input (16x8)
 220   in[0] = vld1q_s16(input);
 221   input += 8;
 222   in[8] = vld1q_s16(input);
 223   input += 8;
 224   in[1] = vld1q_s16(input);
 225   input += 8;
 226   in[9] = vld1q_s16(input);
 227   input += 8;
 228   in[2] = vld1q_s16(input);
 229   input += 8;
 230   in[10] = vld1q_s16(input);
 231   input += 8;
 232   in[3] = vld1q_s16(input);
 233   input += 8;
 234   in[11] = vld1q_s16(input);
 235   input += 8;
 236   in[4] = vld1q_s16(input);
 237   input += 8;
 238   in[12] = vld1q_s16(input);
 239   input += 8;
 240   in[5] = vld1q_s16(input);
 241   input += 8;
 242   in[13] = vld1q_s16(input);
 243   input += 8;
 244   in[6] = vld1q_s16(input);
 245   input += 8;
 246   in[14] = vld1q_s16(input);
 247   input += 8;
 248   in[7] = vld1q_s16(input);
 249   input += 8;
 250   in[15] = vld1q_s16(input);
 251
 252   // Transpose
 253   transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
 254                     &in[7]);
 255   transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
 256                     &in[15]);
 257
 258   // stage 1
 259   step1[0] = in[0 / 2];
 260   step1[1] = in[16 / 2];
 261   step1[2] = in[8 / 2];
 262   step1[3] = in[24 / 2];
 263   step1[4] = in[4 / 2];
 264   step1[5] = in[20 / 2];
 265   step1[6] = in[12 / 2];
 266   step1[7] = in[28 / 2];
 267   step1[8] = in[2 / 2];
 268   step1[9] = in[18 / 2];
 269   step1[10] = in[10 / 2];
 270   step1[11] = in[26 / 2];
 271   step1[12] = in[6 / 2];
 272   step1[13] = in[22 / 2];
 273   step1[14] = in[14 / 2];
 274   step1[15] = in[30 / 2];
 275
 276   // stage 2
 277   step2[0] = step1[0];
 278   step2[1] = step1[1];
 279   step2[2] = step1[2];
 280   step2[3] = step1[3];
 281   step2[4] = step1[4];
 282   step2[5] = step1[5];
 283   step2[6] = step1[6];
 284   step2[7] = step1[7];
 285   idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
 286   idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
 287                    &step2[14]);
 288   idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
 289                    &step2[13]);
 290   idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
 291                   &step2[12]);
 292
 293   // stage 3
 294   step1[0] = step2[0];
 295   step1[1] = step2[1];
 296   step1[2] = step2[2];
 297   step1[3] = step2[3];
 298   idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
 299   idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
 300   step1[8] = vaddq_s16(step2[8], step2[9]);
 301   step1[9] = vsubq_s16(step2[8], step2[9]);
 302   step1[10] = vsubq_s16(step2[11], step2[10]);
 303   step1[11] = vaddq_s16(step2[11], step2[10]);
 304   step1[12] = vaddq_s16(step2[12], step2[13]);
 305   step1[13] = vsubq_s16(step2[12], step2[13]);
 306   step1[14] = vsubq_s16(step2[15], step2[14]);
 307   step1[15] = vaddq_s16(step2[15], step2[14]);
 308
 309   // stage 4
 310   idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
 311   idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
 312   step2[4] = vaddq_s16(step1[4], step1[5]);
 313   step2[5] = vsubq_s16(step1[4], step1[5]);
 314   step2[6] = vsubq_s16(step1[7], step1[6]);
 315   step2[7] = vaddq_s16(step1[7], step1[6]);
 316   step2[8] = step1[8];
 317   idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 318                     &step2[14]);
 319   idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 320                         &step2[10]);
 321   step2[11] = step1[11];
 322   step2[12] = step1[12];
 323   step2[15] = step1[15];
 324
 325   // stage 5
 326   step1[0] = vaddq_s16(step2[0], step2[3]);
 327   step1[1] = vaddq_s16(step2[1], step2[2]);
 328   step1[2] = vsubq_s16(step2[1], step2[2]);
 329   step1[3] = vsubq_s16(step2[0], step2[3]);
 330   step1[4] = step2[4];
 331   idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 332   step1[7] = step2[7];
 333   step1[8] = vaddq_s16(step2[8], step2[11]);
 334   step1[9] = vaddq_s16(step2[9], step2[10]);
 335   step1[10] = vsubq_s16(step2[9], step2[10]);
 336   step1[11] = vsubq_s16(step2[8], step2[11]);
 337   step1[12] = vsubq_s16(step2[15], step2[12]);
 338   step1[13] = vsubq_s16(step2[14], step2[13]);
 339   step1[14] = vaddq_s16(step2[14], step2[13]);
 340   step1[15] = vaddq_s16(step2[15], step2[12]);
 341
 342   // stage 6
 343   step2[0] = vaddq_s16(step1[0], step1[7]);
 344   step2[1] = vaddq_s16(step1[1], step1[6]);
 345   step2[2] = vaddq_s16(step1[2], step1[5]);
 346   step2[3] = vaddq_s16(step1[3], step1[4]);
 347   step2[4] = vsubq_s16(step1[3], step1[4]);
 348   step2[5] = vsubq_s16(step1[2], step1[5]);
 349   step2[6] = vsubq_s16(step1[1], step1[6]);
 350   step2[7] = vsubq_s16(step1[0], step1[7]);
 351   idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 352                      &step2[13]);
 353   idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 354                      &step2[12]);
 355   step2[8] = step1[8];
 356   step2[9] = step1[9];
 357   step2[14] = step1[14];
 358   step2[15] = step1[15];
 359
 360   // stage 7
 361   out[0] = vaddq_s16(step2[0], step2[15]);
 362   out[1] = vaddq_s16(step2[1], step2[14]);
 363   out[2] = vaddq_s16(step2[2], step2[13]);
 364   out[3] = vaddq_s16(step2[3], step2[12]);
 365   out[4] = vaddq_s16(step2[4], step2[11]);
 366   out[5] = vaddq_s16(step2[5], step2[10]);
 367   out[6] = vaddq_s16(step2[6], step2[9]);
 368   out[7] = vaddq_s16(step2[7], step2[8]);
 369   out[8] = vsubq_s16(step2[7], step2[8]);
 370   out[9] = vsubq_s16(step2[6], step2[9]);
 371   out[10] = vsubq_s16(step2[5], step2[10]);
 372   out[11] = vsubq_s16(step2[4], step2[11]);
 373   out[12] = vsubq_s16(step2[3], step2[12]);
 374   out[13] = vsubq_s16(step2[2], step2[13]);
 375   out[14] = vsubq_s16(step2[1], step2[14]);
 376   out[15] = vsubq_s16(step2[0], step2[15]);
 377
 378   if (output) {
 379     // pass 1: save the result into output
 380     vst1q_s16(output, out[0]);
 381     output += 16;
 382     vst1q_s16(output, out[1]);
 383     output += 16;
 384     vst1q_s16(output, out[2]);
 385     output += 16;
 386     vst1q_s16(output, out[3]);
 387     output += 16;
 388     vst1q_s16(output, out[4]);
 389     output += 16;
 390     vst1q_s16(output, out[5]);
 391     output += 16;
 392     vst1q_s16(output, out[6]);
 393     output += 16;
 394     vst1q_s16(output, out[7]);
 395     output += 16;
 396     vst1q_s16(output, out[8]);
 397     output += 16;
 398     vst1q_s16(output, out[9]);
 399     output += 16;
 400     vst1q_s16(output, out[10]);
 401     output += 16;
 402     vst1q_s16(output, out[11]);
 403     output += 16;
 404     vst1q_s16(output, out[12]);
 405     output += 16;
 406     vst1q_s16(output, out[13]);
 407     output += 16;
 408     vst1q_s16(output, out[14]);
 409     output += 16;
 410     vst1q_s16(output, out[15]);
 411   } else {
 412     // pass 2: add the result to dest.
 413     idct16x16_add8x1(out[0], &dest, stride);
 414     idct16x16_add8x1(out[1], &dest, stride);
 415     idct16x16_add8x1(out[2], &dest, stride);
 416     idct16x16_add8x1(out[3], &dest, stride);
 417     idct16x16_add8x1(out[4], &dest, stride);
 418     idct16x16_add8x1(out[5], &dest, stride);
 419     idct16x16_add8x1(out[6], &dest, stride);
 420     idct16x16_add8x1(out[7], &dest, stride);
 421     idct16x16_add8x1(out[8], &dest, stride);
 422     idct16x16_add8x1(out[9], &dest, stride);
 423     idct16x16_add8x1(out[10], &dest, stride);
 424     idct16x16_add8x1(out[11], &dest, stride);
 425     idct16x16_add8x1(out[12], &dest, stride);
 426     idct16x16_add8x1(out[13], &dest, stride);
 427     idct16x16_add8x1(out[14], &dest, stride);
 428     idct16x16_add8x1(out[15], &dest, stride);
 429   }
 430 }
 431
 432 static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
 433                                           int16_t *output) {
 434   const int16x8_t cospis0 = vld1q_s16(kCospi);
 435   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 436   const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
 437   const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
 438   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 439   const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
 440   const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
 441   const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
 442   const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
 443   int16x4_t in[4], step1[16], step2[16], out[16];
 444
 445 // Load input (4x4)
 446 #if CONFIG_VP9_HIGHBITDEPTH
 447   in[0] = load_tran_low_to_s16d(input);
 448   input += 16;
 449   in[1] = load_tran_low_to_s16d(input);
 450   input += 16;
 451   in[2] = load_tran_low_to_s16d(input);
 452   input += 16;
 453   in[3] = load_tran_low_to_s16d(input);
 454 #else
 455   in[0] = vld1_s16(input);
 456   input += 16;
 457   in[1] = vld1_s16(input);
 458   input += 16;
 459   in[2] = vld1_s16(input);
 460   input += 16;
 461   in[3] = vld1_s16(input);
 462 #endif  // CONFIG_VP9_HIGHBITDEPTH
 463
 464   // Transpose
 465   transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
 466
 467   // stage 1
 468   step1[0] = in[0 / 2];
 469   step1[4] = in[4 / 2];
 470   step1[8] = in[2 / 2];
 471   step1[12] = in[6 / 2];
 472
 473   // stage 2
 474   step2[0] = step1[0];
 475   step2[4] = step1[4];
 476   step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
 477   step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
 478   step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
 479   step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
 480
 481   // stage 3
 482   step1[0] = step2[0];
 483   step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
 484   step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
 485   step1[8] = step2[8];
 486   step1[9] = step2[8];
 487   step1[10] = step2[11];
 488   step1[11] = step2[11];
 489   step1[12] = step2[12];
 490   step1[13] = step2[12];
 491   step1[14] = step2[15];
 492   step1[15] = step2[15];
 493
 494   // stage 4
 495   step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
 496   step2[4] = step1[4];
 497   step2[5] = step1[4];
 498   step2[6] = step1[7];
 499   step2[7] = step1[7];
 500   step2[8] = step1[8];
 501   idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 502                     &step2[14]);
 503   idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 504                         &step2[10]);
 505   step2[11] = step1[11];
 506   step2[12] = step1[12];
 507   step2[15] = step1[15];
 508
 509   // stage 5
 510   step1[0] = step2[0];
 511   step1[1] = step2[1];
 512   step1[2] = step2[1];
 513   step1[3] = step2[0];
 514   step1[4] = step2[4];
 515   idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 516   step1[7] = step2[7];
 517   step1[8] = vadd_s16(step2[8], step2[11]);
 518   step1[9] = vadd_s16(step2[9], step2[10]);
 519   step1[10] = vsub_s16(step2[9], step2[10]);
 520   step1[11] = vsub_s16(step2[8], step2[11]);
 521   step1[12] = vsub_s16(step2[15], step2[12]);
 522   step1[13] = vsub_s16(step2[14], step2[13]);
 523   step1[14] = vadd_s16(step2[14], step2[13]);
 524   step1[15] = vadd_s16(step2[15], step2[12]);
 525
 526   // stage 6
 527   step2[0] = vadd_s16(step1[0], step1[7]);
 528   step2[1] = vadd_s16(step1[1], step1[6]);
 529   step2[2] = vadd_s16(step1[2], step1[5]);
 530   step2[3] = vadd_s16(step1[3], step1[4]);
 531   step2[4] = vsub_s16(step1[3], step1[4]);
 532   step2[5] = vsub_s16(step1[2], step1[5]);
 533   step2[6] = vsub_s16(step1[1], step1[6]);
 534   step2[7] = vsub_s16(step1[0], step1[7]);
 535   idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 536                      &step2[13]);
 537   idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 538                      &step2[12]);
 539   step2[8] = step1[8];
 540   step2[9] = step1[9];
 541   step2[14] = step1[14];
 542   step2[15] = step1[15];
 543
 544   // stage 7
 545   out[0] = vadd_s16(step2[0], step2[15]);
 546   out[1] = vadd_s16(step2[1], step2[14]);
 547   out[2] = vadd_s16(step2[2], step2[13]);
 548   out[3] = vadd_s16(step2[3], step2[12]);
 549   out[4] = vadd_s16(step2[4], step2[11]);
 550   out[5] = vadd_s16(step2[5], step2[10]);
 551   out[6] = vadd_s16(step2[6], step2[9]);
 552   out[7] = vadd_s16(step2[7], step2[8]);
 553   out[8] = vsub_s16(step2[7], step2[8]);
 554   out[9] = vsub_s16(step2[6], step2[9]);
 555   out[10] = vsub_s16(step2[5], step2[10]);
 556   out[11] = vsub_s16(step2[4], step2[11]);
 557   out[12] = vsub_s16(step2[3], step2[12]);
 558   out[13] = vsub_s16(step2[2], step2[13]);
 559   out[14] = vsub_s16(step2[1], step2[14]);
 560   out[15] = vsub_s16(step2[0], step2[15]);
 561
 562   // pass 1: save the result into output
 563   vst1_s16(output, out[0]);
 564   output += 4;
 565   vst1_s16(output, out[1]);
 566   output += 4;
 567   vst1_s16(output, out[2]);
 568   output += 4;
 569   vst1_s16(output, out[3]);
 570   output += 4;
 571   vst1_s16(output, out[4]);
 572   output += 4;
 573   vst1_s16(output, out[5]);
 574   output += 4;
 575   vst1_s16(output, out[6]);
 576   output += 4;
 577   vst1_s16(output, out[7]);
 578   output += 4;
 579   vst1_s16(output, out[8]);
 580   output += 4;
 581   vst1_s16(output, out[9]);
 582   output += 4;
 583   vst1_s16(output, out[10]);
 584   output += 4;
 585   vst1_s16(output, out[11]);
 586   output += 4;
 587   vst1_s16(output, out[12]);
 588   output += 4;
 589   vst1_s16(output, out[13]);
 590   output += 4;
 591   vst1_s16(output, out[14]);
 592   output += 4;
 593   vst1_s16(output, out[15]);
 594 }
 595
 596 static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
 597                                           uint8_t *dest, int stride) {
 598   const int16x8_t cospis0 = vld1q_s16(kCospi);
 599   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 600   const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
 601   const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
 602   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 603   const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
 604   const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
 605   const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
 606   const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
 607   int16x4_t ind[8];
 608   int16x8_t in[4], step1[16], step2[16], out[16];
 609
 610   // Load input (4x8)
 611   ind[0] = vld1_s16(input);
 612   input += 4;
 613   ind[1] = vld1_s16(input);
 614   input += 4;
 615   ind[2] = vld1_s16(input);
 616   input += 4;
 617   ind[3] = vld1_s16(input);
 618   input += 4;
 619   ind[4] = vld1_s16(input);
 620   input += 4;
 621   ind[5] = vld1_s16(input);
 622   input += 4;
 623   ind[6] = vld1_s16(input);
 624   input += 4;
 625   ind[7] = vld1_s16(input);
 626
 627   // Transpose
 628   transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
 629                     ind[7], &in[0], &in[1], &in[2], &in[3]);
 630
 631   // stage 1
 632   step1[0] = in[0 / 2];
 633   step1[4] = in[4 / 2];
 634   step1[8] = in[2 / 2];
 635   step1[12] = in[6 / 2];
 636
 637   // stage 2
 638   step2[0] = step1[0];
 639   step2[4] = step1[4];
 640   step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
 641   step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
 642   step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
 643   step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
 644
 645   // stage 3
 646   step1[0] = step2[0];
 647   step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
 648   step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
 649   step1[8] = step2[8];
 650   step1[9] = step2[8];
 651   step1[10] = step2[11];
 652   step1[11] = step2[11];
 653   step1[12] = step2[12];
 654   step1[13] = step2[12];
 655   step1[14] = step2[15];
 656   step1[15] = step2[15];
 657
 658   // stage 4
 659   step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
 660   step2[4] = step1[4];
 661   step2[5] = step1[4];
 662   step2[6] = step1[7];
 663   step2[7] = step1[7];
 664   step2[8] = step1[8];
 665   idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 666                     &step2[14]);
 667   idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 668                         &step2[10]);
 669   step2[11] = step1[11];
 670   step2[12] = step1[12];
 671   step2[15] = step1[15];
 672
 673   // stage 5
 674   step1[0] = step2[0];
 675   step1[1] = step2[1];
 676   step1[2] = step2[1];
 677   step1[3] = step2[0];
 678   step1[4] = step2[4];
 679   idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 680   step1[7] = step2[7];
 681   step1[8] = vaddq_s16(step2[8], step2[11]);
 682   step1[9] = vaddq_s16(step2[9], step2[10]);
 683   step1[10] = vsubq_s16(step2[9], step2[10]);
 684   step1[11] = vsubq_s16(step2[8], step2[11]);
 685   step1[12] = vsubq_s16(step2[15], step2[12]);
 686   step1[13] = vsubq_s16(step2[14], step2[13]);
 687   step1[14] = vaddq_s16(step2[14], step2[13]);
 688   step1[15] = vaddq_s16(step2[15], step2[12]);
 689
 690   // stage 6
 691   step2[0] = vaddq_s16(step1[0], step1[7]);
 692   step2[1] = vaddq_s16(step1[1], step1[6]);
 693   step2[2] = vaddq_s16(step1[2], step1[5]);
 694   step2[3] = vaddq_s16(step1[3], step1[4]);
 695   step2[4] = vsubq_s16(step1[3], step1[4]);
 696   step2[5] = vsubq_s16(step1[2], step1[5]);
 697   step2[6] = vsubq_s16(step1[1], step1[6]);
 698   step2[7] = vsubq_s16(step1[0], step1[7]);
 699   idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 700                      &step2[13]);
 701   idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 702                      &step2[12]);
 703   step2[8] = step1[8];
 704   step2[9] = step1[9];
 705   step2[14] = step1[14];
 706   step2[15] = step1[15];
 707
 708   // stage 7
 709   out[0] = vaddq_s16(step2[0], step2[15]);
 710   out[1] = vaddq_s16(step2[1], step2[14]);
 711   out[2] = vaddq_s16(step2[2], step2[13]);
 712   out[3] = vaddq_s16(step2[3], step2[12]);
 713   out[4] = vaddq_s16(step2[4], step2[11]);
 714   out[5] = vaddq_s16(step2[5], step2[10]);
 715   out[6] = vaddq_s16(step2[6], step2[9]);
 716   out[7] = vaddq_s16(step2[7], step2[8]);
 717   out[8] = vsubq_s16(step2[7], step2[8]);
 718   out[9] = vsubq_s16(step2[6], step2[9]);
 719   out[10] = vsubq_s16(step2[5], step2[10]);
 720   out[11] = vsubq_s16(step2[4], step2[11]);
 721   out[12] = vsubq_s16(step2[3], step2[12]);
 722   out[13] = vsubq_s16(step2[2], step2[13]);
 723   out[14] = vsubq_s16(step2[1], step2[14]);
 724   out[15] = vsubq_s16(step2[0], step2[15]);
 725
 726   if (output) {
 727     // pass 1: save the result into output
 728     vst1q_s16(output, out[0]);
 729     output += 16;
 730     vst1q_s16(output, out[1]);
 731     output += 16;
 732     vst1q_s16(output, out[2]);
 733     output += 16;
 734     vst1q_s16(output, out[3]);
 735     output += 16;
 736     vst1q_s16(output, out[4]);
 737     output += 16;
 738     vst1q_s16(output, out[5]);
 739     output += 16;
 740     vst1q_s16(output, out[6]);
 741     output += 16;
 742     vst1q_s16(output, out[7]);
 743     output += 16;
 744     vst1q_s16(output, out[8]);
 745     output += 16;
 746     vst1q_s16(output, out[9]);
 747     output += 16;
 748     vst1q_s16(output, out[10]);
 749     output += 16;
 750     vst1q_s16(output, out[11]);
 751     output += 16;
 752     vst1q_s16(output, out[12]);
 753     output += 16;
 754     vst1q_s16(output, out[13]);
 755     output += 16;
 756     vst1q_s16(output, out[14]);
 757     output += 16;
 758     vst1q_s16(output, out[15]);
 759   } else {
 760     // pass 2: add the result to dest.
 761     idct16x16_add8x1(out[0], &dest, stride);
 762     idct16x16_add8x1(out[1], &dest, stride);
 763     idct16x16_add8x1(out[2], &dest, stride);
 764     idct16x16_add8x1(out[3], &dest, stride);
 765     idct16x16_add8x1(out[4], &dest, stride);
 766     idct16x16_add8x1(out[5], &dest, stride);
 767     idct16x16_add8x1(out[6], &dest, stride);
 768     idct16x16_add8x1(out[7], &dest, stride);
 769     idct16x16_add8x1(out[8], &dest, stride);
 770     idct16x16_add8x1(out[9], &dest, stride);
 771     idct16x16_add8x1(out[10], &dest, stride);
 772     idct16x16_add8x1(out[11], &dest, stride);
 773     idct16x16_add8x1(out[12], &dest, stride);
 774     idct16x16_add8x1(out[13], &dest, stride);
 775     idct16x16_add8x1(out[14], &dest, stride);
 776     idct16x16_add8x1(out[15], &dest, stride);
 777   }
 778 }
 779
 780 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
 781                                 int stride) {
 782   int16_t row_idct_output[16 * 16];
 783
 784 #if CONFIG_VP9_HIGHBITDEPTH
 785   int16_t pass1_input[16 * 16];
 786   idct16x16_256_add_load_tran_low(input, pass1_input);
 787 #else
 788   const int16_t *pass1_input = input;
 789 #endif  // CONFIG_VP9_HIGHBITDEPTH
 790
 791   // pass 1
 792   // Parallel idct on the upper 8 rows
 793   idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride);
 794
 795   // Parallel idct on the lower 8 rows
 796   idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest,
 797                            stride);
 798
 799   // pass 2
 800   // Parallel idct to get the left 8 columns
 801   idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride);
 802
 803   // Parallel idct to get the right 8 columns
 804   idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
 805 }
 806
 807 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
 808                                int stride) {
 809   int16_t row_idct_output[4 * 16];
 810
 811   // pass 1
 812   // Parallel idct on the upper 8 rows
 813   idct16x16_10_add_half1d_pass1(input, row_idct_output);
 814
 815   // pass 2
 816   // Parallel idct to get the left 8 columns
 817   idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
 818
 819   // Parallel idct to get the right 8 columns
 820   idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
 821                                 stride);
 822 }