granicus.if.org Git - libvpx/blob - vpx_dsp/arm/idct16x16_add_neon.c

   1 /*
   2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <arm_neon.h>
  12
  13 #include "./vpx_dsp_rtcd.h"
  14 #include "vpx_dsp/arm/idct_neon.h"
  15 #include "vpx_dsp/txfm_common.h"
  16
  17 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
  18                                 int16x4_t *const d1) {
  19   *d0 = vrshrn_n_s32(t32[0], 14);
  20   *d1 = vrshrn_n_s32(t32[1], 14);
  21 }
  22
  23 static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
  24                                    const int16x4_t cospi_2_30_10_22,
  25                                    int16x8_t *const d0, int16x8_t *const d1) {
  26   int32x4_t t32[4];
  27
  28   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
  29   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
  30   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
  31   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
  32   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
  33   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
  34   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
  35   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
  36   idct16x16_add_wrap_low_8x2(t32, d0, d1);
  37 }
  38
  39 static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
  40                                    const int16x4_t cospi_4_12_20N_28,
  41                                    int16x8_t *const d0, int16x8_t *const d1) {
  42   int32x4_t t32[4];
  43
  44   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
  45   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
  46   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
  47   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
  48   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
  49   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
  50   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
  51   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
  52   idct16x16_add_wrap_low_8x2(t32, d0, d1);
  53 }
  54
  55 static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
  56                                    const int16x4_t cospi_6_26_14_18N,
  57                                    int16x8_t *const d0, int16x8_t *const d1) {
  58   int32x4_t t32[4];
  59
  60   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
  61   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
  62   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
  63   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
  64   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
  65   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
  66   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
  67   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
  68   idct16x16_add_wrap_low_8x2(t32, d0, d1);
  69 }
  70
  71 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
  72                                             const int16x4_t s1,
  73                                             const int16x4_t cospi_0_8_16_24,
  74                                             int32x4_t *const t32) {
  75   t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3);
  76   t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3);
  77   t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1);
  78   t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1);
  79 }
  80
  81 static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1,
  82                                      const int16x4_t cospi_0_8_16_24,
  83                                      int16x4_t *const d0, int16x4_t *const d1) {
  84   int32x4_t t32[2];
  85
  86   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
  87   wrap_low_4x2(t32, d0, d1);
  88 }
  89
  90 static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
  91                                          const int16x4_t cospi_0_8_16_24,
  92                                          int16x4_t *const d0,
  93                                          int16x4_t *const d1) {
  94   int32x4_t t32[2];
  95
  96   idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32);
  97   t32[1] = vnegq_s32(t32[1]);
  98   wrap_low_4x2(t32, d0, d1);
  99 }
 100
 101 static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
 102                                     const int16x4_t cospi_2_30_10_22,
 103                                     int16x8_t *const d0, int16x8_t *const d1) {
 104   int32x4_t t32[4];
 105
 106   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
 107   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
 108   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
 109   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
 110   t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
 111   t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
 112   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
 113   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
 114   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 115 }
 116
 117 static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
 118                                     const int16x4_t cospi_4_12_20N_28,
 119                                     int16x8_t *const d0, int16x8_t *const d1) {
 120   int32x4_t t32[4];
 121
 122   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
 123   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
 124   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
 125   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
 126   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
 127   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
 128   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
 129   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
 130   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 131 }
 132
 133 static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
 134                                     const int16x4_t cospi_6_26_14_18N,
 135                                     int16x8_t *const d0, int16x8_t *const d1) {
 136   int32x4_t t32[4];
 137
 138   t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
 139   t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
 140   t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
 141   t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
 142   t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
 143   t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
 144   t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
 145   t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
 146   idct16x16_add_wrap_low_8x2(t32, d0, d1);
 147 }
 148
 149 static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
 150                                       const int16x4_t cospi_0_8_16_24,
 151                                       int16x4_t *const d0,
 152                                       int16x4_t *const d1) {
 153   int32x4_t t32[3];
 154
 155   t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2);
 156   t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
 157   t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2);
 158   wrap_low_4x2(t32, d0, d1);
 159 }
 160
 161 static void idct16x16_256_add_half1d(const void *const input, int16_t *output,
 162                                      uint8_t *dest, int stride) {
 163   const int16x8_t cospis0 = vld1q_s16(kCospi);
 164   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 165   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 166   const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0);
 167   const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1);
 168   const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1);
 169   int16x8_t in[16], step1[16], step2[16], out[16];
 170
 171   // Load input (16x8)
 172   if (output) {
 173     const tran_low_t *inputT = (const tran_low_t *)input;
 174     in[0] = load_tran_low_to_s16q(inputT);
 175     inputT += 8;
 176     in[8] = load_tran_low_to_s16q(inputT);
 177     inputT += 8;
 178     in[1] = load_tran_low_to_s16q(inputT);
 179     inputT += 8;
 180     in[9] = load_tran_low_to_s16q(inputT);
 181     inputT += 8;
 182     in[2] = load_tran_low_to_s16q(inputT);
 183     inputT += 8;
 184     in[10] = load_tran_low_to_s16q(inputT);
 185     inputT += 8;
 186     in[3] = load_tran_low_to_s16q(inputT);
 187     inputT += 8;
 188     in[11] = load_tran_low_to_s16q(inputT);
 189     inputT += 8;
 190     in[4] = load_tran_low_to_s16q(inputT);
 191     inputT += 8;
 192     in[12] = load_tran_low_to_s16q(inputT);
 193     inputT += 8;
 194     in[5] = load_tran_low_to_s16q(inputT);
 195     inputT += 8;
 196     in[13] = load_tran_low_to_s16q(inputT);
 197     inputT += 8;
 198     in[6] = load_tran_low_to_s16q(inputT);
 199     inputT += 8;
 200     in[14] = load_tran_low_to_s16q(inputT);
 201     inputT += 8;
 202     in[7] = load_tran_low_to_s16q(inputT);
 203     inputT += 8;
 204     in[15] = load_tran_low_to_s16q(inputT);
 205   } else {
 206     const int16_t *inputT = (const int16_t *)input;
 207     in[0] = vld1q_s16(inputT);
 208     inputT += 8;
 209     in[8] = vld1q_s16(inputT);
 210     inputT += 8;
 211     in[1] = vld1q_s16(inputT);
 212     inputT += 8;
 213     in[9] = vld1q_s16(inputT);
 214     inputT += 8;
 215     in[2] = vld1q_s16(inputT);
 216     inputT += 8;
 217     in[10] = vld1q_s16(inputT);
 218     inputT += 8;
 219     in[3] = vld1q_s16(inputT);
 220     inputT += 8;
 221     in[11] = vld1q_s16(inputT);
 222     inputT += 8;
 223     in[4] = vld1q_s16(inputT);
 224     inputT += 8;
 225     in[12] = vld1q_s16(inputT);
 226     inputT += 8;
 227     in[5] = vld1q_s16(inputT);
 228     inputT += 8;
 229     in[13] = vld1q_s16(inputT);
 230     inputT += 8;
 231     in[6] = vld1q_s16(inputT);
 232     inputT += 8;
 233     in[14] = vld1q_s16(inputT);
 234     inputT += 8;
 235     in[7] = vld1q_s16(inputT);
 236     inputT += 8;
 237     in[15] = vld1q_s16(inputT);
 238   }
 239
 240   // Transpose
 241   transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
 242                     &in[7]);
 243   transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
 244                     &in[15]);
 245
 246   // stage 1
 247   step1[0] = in[0 / 2];
 248   step1[1] = in[16 / 2];
 249   step1[2] = in[8 / 2];
 250   step1[3] = in[24 / 2];
 251   step1[4] = in[4 / 2];
 252   step1[5] = in[20 / 2];
 253   step1[6] = in[12 / 2];
 254   step1[7] = in[28 / 2];
 255   step1[8] = in[2 / 2];
 256   step1[9] = in[18 / 2];
 257   step1[10] = in[10 / 2];
 258   step1[11] = in[26 / 2];
 259   step1[12] = in[6 / 2];
 260   step1[13] = in[22 / 2];
 261   step1[14] = in[14 / 2];
 262   step1[15] = in[30 / 2];
 263
 264   // stage 2
 265   step2[0] = step1[0];
 266   step2[1] = step1[1];
 267   step2[2] = step1[2];
 268   step2[3] = step1[3];
 269   step2[4] = step1[4];
 270   step2[5] = step1[5];
 271   step2[6] = step1[6];
 272   step2[7] = step1[7];
 273   idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]);
 274   idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
 275                    &step2[14]);
 276   idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
 277                    &step2[13]);
 278   idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
 279                   &step2[12]);
 280
 281   // stage 3
 282   step1[0] = step2[0];
 283   step1[1] = step2[1];
 284   step1[2] = step2[2];
 285   step1[3] = step2[3];
 286   idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]);
 287   idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]);
 288   step1[8] = vaddq_s16(step2[8], step2[9]);
 289   step1[9] = vsubq_s16(step2[8], step2[9]);
 290   step1[10] = vsubq_s16(step2[11], step2[10]);
 291   step1[11] = vaddq_s16(step2[11], step2[10]);
 292   step1[12] = vaddq_s16(step2[12], step2[13]);
 293   step1[13] = vsubq_s16(step2[12], step2[13]);
 294   step1[14] = vsubq_s16(step2[15], step2[14]);
 295   step1[15] = vaddq_s16(step2[15], step2[14]);
 296
 297   // stage 4
 298   idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]);
 299   idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]);
 300   step2[4] = vaddq_s16(step1[4], step1[5]);
 301   step2[5] = vsubq_s16(step1[4], step1[5]);
 302   step2[6] = vsubq_s16(step1[7], step1[6]);
 303   step2[7] = vaddq_s16(step1[7], step1[6]);
 304   step2[8] = step1[8];
 305   idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 306                     &step2[14]);
 307   idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 308                         &step2[10]);
 309   step2[11] = step1[11];
 310   step2[12] = step1[12];
 311   step2[15] = step1[15];
 312
 313   // stage 5
 314   step1[0] = vaddq_s16(step2[0], step2[3]);
 315   step1[1] = vaddq_s16(step2[1], step2[2]);
 316   step1[2] = vsubq_s16(step2[1], step2[2]);
 317   step1[3] = vsubq_s16(step2[0], step2[3]);
 318   step1[4] = step2[4];
 319   idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 320   step1[7] = step2[7];
 321   step1[8] = vaddq_s16(step2[8], step2[11]);
 322   step1[9] = vaddq_s16(step2[9], step2[10]);
 323   step1[10] = vsubq_s16(step2[9], step2[10]);
 324   step1[11] = vsubq_s16(step2[8], step2[11]);
 325   step1[12] = vsubq_s16(step2[15], step2[12]);
 326   step1[13] = vsubq_s16(step2[14], step2[13]);
 327   step1[14] = vaddq_s16(step2[14], step2[13]);
 328   step1[15] = vaddq_s16(step2[15], step2[12]);
 329
 330   // stage 6
 331   step2[0] = vaddq_s16(step1[0], step1[7]);
 332   step2[1] = vaddq_s16(step1[1], step1[6]);
 333   step2[2] = vaddq_s16(step1[2], step1[5]);
 334   step2[3] = vaddq_s16(step1[3], step1[4]);
 335   step2[4] = vsubq_s16(step1[3], step1[4]);
 336   step2[5] = vsubq_s16(step1[2], step1[5]);
 337   step2[6] = vsubq_s16(step1[1], step1[6]);
 338   step2[7] = vsubq_s16(step1[0], step1[7]);
 339   idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 340                      &step2[13]);
 341   idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 342                      &step2[12]);
 343   step2[8] = step1[8];
 344   step2[9] = step1[9];
 345   step2[14] = step1[14];
 346   step2[15] = step1[15];
 347
 348   // stage 7
 349   out[0] = vaddq_s16(step2[0], step2[15]);
 350   out[1] = vaddq_s16(step2[1], step2[14]);
 351   out[2] = vaddq_s16(step2[2], step2[13]);
 352   out[3] = vaddq_s16(step2[3], step2[12]);
 353   out[4] = vaddq_s16(step2[4], step2[11]);
 354   out[5] = vaddq_s16(step2[5], step2[10]);
 355   out[6] = vaddq_s16(step2[6], step2[9]);
 356   out[7] = vaddq_s16(step2[7], step2[8]);
 357   out[8] = vsubq_s16(step2[7], step2[8]);
 358   out[9] = vsubq_s16(step2[6], step2[9]);
 359   out[10] = vsubq_s16(step2[5], step2[10]);
 360   out[11] = vsubq_s16(step2[4], step2[11]);
 361   out[12] = vsubq_s16(step2[3], step2[12]);
 362   out[13] = vsubq_s16(step2[2], step2[13]);
 363   out[14] = vsubq_s16(step2[1], step2[14]);
 364   out[15] = vsubq_s16(step2[0], step2[15]);
 365
 366   if (output) {
 367     // pass 1: save the result into output
 368     vst1q_s16(output, out[0]);
 369     output += 16;
 370     vst1q_s16(output, out[1]);
 371     output += 16;
 372     vst1q_s16(output, out[2]);
 373     output += 16;
 374     vst1q_s16(output, out[3]);
 375     output += 16;
 376     vst1q_s16(output, out[4]);
 377     output += 16;
 378     vst1q_s16(output, out[5]);
 379     output += 16;
 380     vst1q_s16(output, out[6]);
 381     output += 16;
 382     vst1q_s16(output, out[7]);
 383     output += 16;
 384     vst1q_s16(output, out[8]);
 385     output += 16;
 386     vst1q_s16(output, out[9]);
 387     output += 16;
 388     vst1q_s16(output, out[10]);
 389     output += 16;
 390     vst1q_s16(output, out[11]);
 391     output += 16;
 392     vst1q_s16(output, out[12]);
 393     output += 16;
 394     vst1q_s16(output, out[13]);
 395     output += 16;
 396     vst1q_s16(output, out[14]);
 397     output += 16;
 398     vst1q_s16(output, out[15]);
 399   } else {
 400     // pass 2: add the result to dest.
 401     idct16x16_add8x1(out[0], &dest, stride);
 402     idct16x16_add8x1(out[1], &dest, stride);
 403     idct16x16_add8x1(out[2], &dest, stride);
 404     idct16x16_add8x1(out[3], &dest, stride);
 405     idct16x16_add8x1(out[4], &dest, stride);
 406     idct16x16_add8x1(out[5], &dest, stride);
 407     idct16x16_add8x1(out[6], &dest, stride);
 408     idct16x16_add8x1(out[7], &dest, stride);
 409     idct16x16_add8x1(out[8], &dest, stride);
 410     idct16x16_add8x1(out[9], &dest, stride);
 411     idct16x16_add8x1(out[10], &dest, stride);
 412     idct16x16_add8x1(out[11], &dest, stride);
 413     idct16x16_add8x1(out[12], &dest, stride);
 414     idct16x16_add8x1(out[13], &dest, stride);
 415     idct16x16_add8x1(out[14], &dest, stride);
 416     idct16x16_add8x1(out[15], &dest, stride);
 417   }
 418 }
 419
 420 static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
 421                                           int16_t *output) {
 422   const int16x8_t cospis0 = vld1q_s16(kCospi);
 423   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 424   const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
 425   const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
 426   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 427   const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
 428   const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
 429   const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
 430   const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
 431   int16x4_t in[4], step1[16], step2[16], out[16];
 432
 433   // Load input (4x4)
 434   in[0] = load_tran_low_to_s16d(input);
 435   input += 16;
 436   in[1] = load_tran_low_to_s16d(input);
 437   input += 16;
 438   in[2] = load_tran_low_to_s16d(input);
 439   input += 16;
 440   in[3] = load_tran_low_to_s16d(input);
 441
 442   // Transpose
 443   transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
 444
 445   // stage 1
 446   step1[0] = in[0 / 2];
 447   step1[4] = in[4 / 2];
 448   step1[8] = in[2 / 2];
 449   step1[12] = in[6 / 2];
 450
 451   // stage 2
 452   step2[0] = step1[0];
 453   step2[4] = step1[4];
 454   step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1);
 455   step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1);
 456   step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0);
 457   step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0);
 458
 459   // stage 3
 460   step1[0] = step2[0];
 461   step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3);
 462   step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0);
 463   step1[8] = step2[8];
 464   step1[9] = step2[8];
 465   step1[10] = step2[11];
 466   step1[11] = step2[11];
 467   step1[12] = step2[12];
 468   step1[13] = step2[12];
 469   step1[14] = step2[15];
 470   step1[15] = step2[15];
 471
 472   // stage 4
 473   step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2);
 474   step2[4] = step1[4];
 475   step2[5] = step1[4];
 476   step2[6] = step1[7];
 477   step2[7] = step1[7];
 478   step2[8] = step1[8];
 479   idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 480                     &step2[14]);
 481   idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 482                         &step2[10]);
 483   step2[11] = step1[11];
 484   step2[12] = step1[12];
 485   step2[15] = step1[15];
 486
 487   // stage 5
 488   step1[0] = step2[0];
 489   step1[1] = step2[1];
 490   step1[2] = step2[1];
 491   step1[3] = step2[0];
 492   step1[4] = step2[4];
 493   idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 494   step1[7] = step2[7];
 495   step1[8] = vadd_s16(step2[8], step2[11]);
 496   step1[9] = vadd_s16(step2[9], step2[10]);
 497   step1[10] = vsub_s16(step2[9], step2[10]);
 498   step1[11] = vsub_s16(step2[8], step2[11]);
 499   step1[12] = vsub_s16(step2[15], step2[12]);
 500   step1[13] = vsub_s16(step2[14], step2[13]);
 501   step1[14] = vadd_s16(step2[14], step2[13]);
 502   step1[15] = vadd_s16(step2[15], step2[12]);
 503
 504   // stage 6
 505   step2[0] = vadd_s16(step1[0], step1[7]);
 506   step2[1] = vadd_s16(step1[1], step1[6]);
 507   step2[2] = vadd_s16(step1[2], step1[5]);
 508   step2[3] = vadd_s16(step1[3], step1[4]);
 509   step2[4] = vsub_s16(step1[3], step1[4]);
 510   step2[5] = vsub_s16(step1[2], step1[5]);
 511   step2[6] = vsub_s16(step1[1], step1[6]);
 512   step2[7] = vsub_s16(step1[0], step1[7]);
 513   idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 514                      &step2[13]);
 515   idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 516                      &step2[12]);
 517   step2[8] = step1[8];
 518   step2[9] = step1[9];
 519   step2[14] = step1[14];
 520   step2[15] = step1[15];
 521
 522   // stage 7
 523   out[0] = vadd_s16(step2[0], step2[15]);
 524   out[1] = vadd_s16(step2[1], step2[14]);
 525   out[2] = vadd_s16(step2[2], step2[13]);
 526   out[3] = vadd_s16(step2[3], step2[12]);
 527   out[4] = vadd_s16(step2[4], step2[11]);
 528   out[5] = vadd_s16(step2[5], step2[10]);
 529   out[6] = vadd_s16(step2[6], step2[9]);
 530   out[7] = vadd_s16(step2[7], step2[8]);
 531   out[8] = vsub_s16(step2[7], step2[8]);
 532   out[9] = vsub_s16(step2[6], step2[9]);
 533   out[10] = vsub_s16(step2[5], step2[10]);
 534   out[11] = vsub_s16(step2[4], step2[11]);
 535   out[12] = vsub_s16(step2[3], step2[12]);
 536   out[13] = vsub_s16(step2[2], step2[13]);
 537   out[14] = vsub_s16(step2[1], step2[14]);
 538   out[15] = vsub_s16(step2[0], step2[15]);
 539
 540   // pass 1: save the result into output
 541   vst1_s16(output, out[0]);
 542   output += 4;
 543   vst1_s16(output, out[1]);
 544   output += 4;
 545   vst1_s16(output, out[2]);
 546   output += 4;
 547   vst1_s16(output, out[3]);
 548   output += 4;
 549   vst1_s16(output, out[4]);
 550   output += 4;
 551   vst1_s16(output, out[5]);
 552   output += 4;
 553   vst1_s16(output, out[6]);
 554   output += 4;
 555   vst1_s16(output, out[7]);
 556   output += 4;
 557   vst1_s16(output, out[8]);
 558   output += 4;
 559   vst1_s16(output, out[9]);
 560   output += 4;
 561   vst1_s16(output, out[10]);
 562   output += 4;
 563   vst1_s16(output, out[11]);
 564   output += 4;
 565   vst1_s16(output, out[12]);
 566   output += 4;
 567   vst1_s16(output, out[13]);
 568   output += 4;
 569   vst1_s16(output, out[14]);
 570   output += 4;
 571   vst1_s16(output, out[15]);
 572 }
 573
 574 static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
 575                                           uint8_t *dest, int stride) {
 576   const int16x8_t cospis0 = vld1q_s16(kCospi);
 577   const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
 578   const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
 579   const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
 580   const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
 581   const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
 582   const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
 583   const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
 584   const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
 585   int16x4_t ind[8];
 586   int16x8_t in[4], step1[16], step2[16], out[16];
 587
 588   // Load input (4x8)
 589   ind[0] = vld1_s16(input);
 590   input += 4;
 591   ind[1] = vld1_s16(input);
 592   input += 4;
 593   ind[2] = vld1_s16(input);
 594   input += 4;
 595   ind[3] = vld1_s16(input);
 596   input += 4;
 597   ind[4] = vld1_s16(input);
 598   input += 4;
 599   ind[5] = vld1_s16(input);
 600   input += 4;
 601   ind[6] = vld1_s16(input);
 602   input += 4;
 603   ind[7] = vld1_s16(input);
 604
 605   // Transpose
 606   transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6],
 607                     ind[7], &in[0], &in[1], &in[2], &in[3]);
 608
 609   // stage 1
 610   step1[0] = in[0 / 2];
 611   step1[4] = in[4 / 2];
 612   step1[8] = in[2 / 2];
 613   step1[12] = in[6 / 2];
 614
 615   // stage 2
 616   step2[0] = step1[0];
 617   step2[4] = step1[4];
 618   step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
 619   step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
 620   step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
 621   step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
 622
 623   // stage 3
 624   step1[0] = step2[0];
 625   step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
 626   step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
 627   step1[8] = step2[8];
 628   step1[9] = step2[8];
 629   step1[10] = step2[11];
 630   step1[11] = step2[11];
 631   step1[12] = step2[12];
 632   step1[13] = step2[12];
 633   step1[14] = step2[15];
 634   step1[15] = step2[15];
 635
 636   // stage 4
 637   step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
 638   step2[4] = step1[4];
 639   step2[5] = step1[4];
 640   step2[6] = step1[7];
 641   step2[7] = step1[7];
 642   step2[8] = step1[8];
 643   idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
 644                     &step2[14]);
 645   idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
 646                         &step2[10]);
 647   step2[11] = step1[11];
 648   step2[12] = step1[12];
 649   step2[15] = step1[15];
 650
 651   // stage 5
 652   step1[0] = step2[0];
 653   step1[1] = step2[1];
 654   step1[2] = step2[1];
 655   step1[3] = step2[0];
 656   step1[4] = step2[4];
 657   idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
 658   step1[7] = step2[7];
 659   step1[8] = vaddq_s16(step2[8], step2[11]);
 660   step1[9] = vaddq_s16(step2[9], step2[10]);
 661   step1[10] = vsubq_s16(step2[9], step2[10]);
 662   step1[11] = vsubq_s16(step2[8], step2[11]);
 663   step1[12] = vsubq_s16(step2[15], step2[12]);
 664   step1[13] = vsubq_s16(step2[14], step2[13]);
 665   step1[14] = vaddq_s16(step2[14], step2[13]);
 666   step1[15] = vaddq_s16(step2[15], step2[12]);
 667
 668   // stage 6
 669   step2[0] = vaddq_s16(step1[0], step1[7]);
 670   step2[1] = vaddq_s16(step1[1], step1[6]);
 671   step2[2] = vaddq_s16(step1[2], step1[5]);
 672   step2[3] = vaddq_s16(step1[3], step1[4]);
 673   step2[4] = vsubq_s16(step1[3], step1[4]);
 674   step2[5] = vsubq_s16(step1[2], step1[5]);
 675   step2[6] = vsubq_s16(step1[1], step1[6]);
 676   step2[7] = vsubq_s16(step1[0], step1[7]);
 677   idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
 678                      &step2[13]);
 679   idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
 680                      &step2[12]);
 681   step2[8] = step1[8];
 682   step2[9] = step1[9];
 683   step2[14] = step1[14];
 684   step2[15] = step1[15];
 685
 686   // stage 7
 687   out[0] = vaddq_s16(step2[0], step2[15]);
 688   out[1] = vaddq_s16(step2[1], step2[14]);
 689   out[2] = vaddq_s16(step2[2], step2[13]);
 690   out[3] = vaddq_s16(step2[3], step2[12]);
 691   out[4] = vaddq_s16(step2[4], step2[11]);
 692   out[5] = vaddq_s16(step2[5], step2[10]);
 693   out[6] = vaddq_s16(step2[6], step2[9]);
 694   out[7] = vaddq_s16(step2[7], step2[8]);
 695   out[8] = vsubq_s16(step2[7], step2[8]);
 696   out[9] = vsubq_s16(step2[6], step2[9]);
 697   out[10] = vsubq_s16(step2[5], step2[10]);
 698   out[11] = vsubq_s16(step2[4], step2[11]);
 699   out[12] = vsubq_s16(step2[3], step2[12]);
 700   out[13] = vsubq_s16(step2[2], step2[13]);
 701   out[14] = vsubq_s16(step2[1], step2[14]);
 702   out[15] = vsubq_s16(step2[0], step2[15]);
 703
 704   if (output) {
 705     // pass 1: save the result into output
 706     vst1q_s16(output, out[0]);
 707     output += 16;
 708     vst1q_s16(output, out[1]);
 709     output += 16;
 710     vst1q_s16(output, out[2]);
 711     output += 16;
 712     vst1q_s16(output, out[3]);
 713     output += 16;
 714     vst1q_s16(output, out[4]);
 715     output += 16;
 716     vst1q_s16(output, out[5]);
 717     output += 16;
 718     vst1q_s16(output, out[6]);
 719     output += 16;
 720     vst1q_s16(output, out[7]);
 721     output += 16;
 722     vst1q_s16(output, out[8]);
 723     output += 16;
 724     vst1q_s16(output, out[9]);
 725     output += 16;
 726     vst1q_s16(output, out[10]);
 727     output += 16;
 728     vst1q_s16(output, out[11]);
 729     output += 16;
 730     vst1q_s16(output, out[12]);
 731     output += 16;
 732     vst1q_s16(output, out[13]);
 733     output += 16;
 734     vst1q_s16(output, out[14]);
 735     output += 16;
 736     vst1q_s16(output, out[15]);
 737   } else {
 738     // pass 2: add the result to dest.
 739     idct16x16_add8x1(out[0], &dest, stride);
 740     idct16x16_add8x1(out[1], &dest, stride);
 741     idct16x16_add8x1(out[2], &dest, stride);
 742     idct16x16_add8x1(out[3], &dest, stride);
 743     idct16x16_add8x1(out[4], &dest, stride);
 744     idct16x16_add8x1(out[5], &dest, stride);
 745     idct16x16_add8x1(out[6], &dest, stride);
 746     idct16x16_add8x1(out[7], &dest, stride);
 747     idct16x16_add8x1(out[8], &dest, stride);
 748     idct16x16_add8x1(out[9], &dest, stride);
 749     idct16x16_add8x1(out[10], &dest, stride);
 750     idct16x16_add8x1(out[11], &dest, stride);
 751     idct16x16_add8x1(out[12], &dest, stride);
 752     idct16x16_add8x1(out[13], &dest, stride);
 753     idct16x16_add8x1(out[14], &dest, stride);
 754     idct16x16_add8x1(out[15], &dest, stride);
 755   }
 756 }
 757
 758 void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
 759                                 int stride) {
 760   int16_t row_idct_output[16 * 16];
 761
 762   // pass 1
 763   // Parallel idct on the upper 8 rows
 764   idct16x16_256_add_half1d(input, row_idct_output, dest, stride);
 765
 766   // Parallel idct on the lower 8 rows
 767   idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride);
 768
 769   // pass 2
 770   // Parallel idct to get the left 8 columns
 771   idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride);
 772
 773   // Parallel idct to get the right 8 columns
 774   idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
 775 }
 776
 777 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
 778                                int stride) {
 779   int16_t row_idct_output[4 * 16];
 780
 781   // pass 1
 782   // Parallel idct on the upper 8 rows
 783   idct16x16_10_add_half1d_pass1(input, row_idct_output);
 784
 785   // pass 2
 786   // Parallel idct to get the left 8 columns
 787   idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
 788
 789   // Parallel idct to get the right 8 columns
 790   idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
 791                                 stride);
 792 }