granicus.if.org Git - libvpx/blob - vpx_dsp/arm/highbd_idct8x8_add_neon.c

   1 /*
   2  *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <arm_neon.h>
  12
  13 #include "./vpx_dsp_rtcd.h"
  14 #include "vpx_dsp/arm/idct_neon.h"
  15 #include "vpx_dsp/arm/transpose_neon.h"
  16 #include "vpx_dsp/inv_txfm.h"
  17
  18 static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
  19                                                    const int stride,
  20                                                    const int16x8_t res,
  21                                                    const int16x8_t max) {
  22   const uint16x8_t a = vld1q_u16(*dest);
  23   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
  24   const int16x8_t c = vminq_s16(b, max);
  25   vst1q_u16(*dest, vreinterpretq_u16_s16(c));
  26   *dest += stride;
  27 }
  28
  29 static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
  30                                                    const int stride,
  31                                                    const int16x8_t res) {
  32   const uint16x8_t a = vld1q_u16(*dest);
  33   const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
  34   const uint16x8_t c = vqshluq_n_s16(b, 0);
  35   vst1q_u16(*dest, c);
  36   *dest += stride;
  37 }
  38
  39 void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest,
  40                                    int stride, int bd) {
  41   const tran_low_t out0 =
  42       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  43   const tran_low_t out1 =
  44       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
  45   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
  46   const int16x8_t dc = vdupq_n_s16(a1);
  47
  48   if (a1 >= 0) {
  49     const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
  50     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  51     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  52     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  53     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  54     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  55     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  56     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  57     highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
  58   } else {
  59     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  60     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  61     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  62     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  63     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  64     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  65     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  66     highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
  67   }
  68 }
  69
  70 static INLINE void idct8x8_12_half1d_bd10(
  71     const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
  72     int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
  73     int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
  74     int32x4_t *const io7) {
  75   int32x4_t step1[8], step2[8];
  76
  77   transpose_s32_4x4(io0, io1, io2, io3);
  78
  79   // stage 1
  80   step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
  81   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
  82   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
  83   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
  84   step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
  85   step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
  86   step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
  87   step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
  88
  89   // stage 2
  90   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
  91   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
  92   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
  93   step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
  94   step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
  95   step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
  96
  97   step2[4] = vaddq_s32(step1[4], step1[5]);
  98   step2[5] = vsubq_s32(step1[4], step1[5]);
  99   step2[6] = vsubq_s32(step1[7], step1[6]);
 100   step2[7] = vaddq_s32(step1[7], step1[6]);
 101
 102   // stage 3
 103   step1[0] = vaddq_s32(step2[1], step2[3]);
 104   step1[1] = vaddq_s32(step2[1], step2[2]);
 105   step1[2] = vsubq_s32(step2[1], step2[2]);
 106   step1[3] = vsubq_s32(step2[1], step2[3]);
 107
 108   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
 109   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
 110   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
 111   step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
 112   step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 113
 114   // stage 4
 115   *io0 = vaddq_s32(step1[0], step2[7]);
 116   *io1 = vaddq_s32(step1[1], step1[6]);
 117   *io2 = vaddq_s32(step1[2], step1[5]);
 118   *io3 = vaddq_s32(step1[3], step2[4]);
 119   *io4 = vsubq_s32(step1[3], step2[4]);
 120   *io5 = vsubq_s32(step1[2], step1[5]);
 121   *io6 = vsubq_s32(step1[1], step1[6]);
 122   *io7 = vsubq_s32(step1[0], step2[7]);
 123 }
 124
 125 static INLINE void idct8x8_12_half1d_bd12(
 126     const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
 127     int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
 128     int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
 129     int32x4_t *const io7) {
 130   int32x2_t input_1l, input_1h, input_3l, input_3h;
 131   int32x2_t step1l[2], step1h[2];
 132   int32x4_t step1[8], step2[8];
 133   int64x2_t t64[8];
 134   int32x2_t t32[8];
 135
 136   transpose_s32_4x4(io0, io1, io2, io3);
 137
 138   // stage 1
 139   input_1l = vget_low_s32(*io1);
 140   input_1h = vget_high_s32(*io1);
 141   input_3l = vget_low_s32(*io3);
 142   input_3h = vget_high_s32(*io3);
 143   step1l[0] = vget_low_s32(*io0);
 144   step1h[0] = vget_high_s32(*io0);
 145   step1l[1] = vget_low_s32(*io2);
 146   step1h[1] = vget_high_s32(*io2);
 147
 148   t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
 149   t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
 150   t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
 151   t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
 152   t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
 153   t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
 154   t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
 155   t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
 156   t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
 157   t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
 158   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 159   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 160   t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
 161   t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
 162   t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
 163   t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
 164   step1[4] = vcombine_s32(t32[0], t32[1]);
 165   step1[5] = vcombine_s32(t32[2], t32[3]);
 166   step1[6] = vcombine_s32(t32[4], t32[5]);
 167   step1[7] = vcombine_s32(t32[6], t32[7]);
 168
 169   // stage 2
 170   t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
 171   t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
 172   t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
 173   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
 174   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
 175   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
 176   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 177   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 178   t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
 179   t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
 180   t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
 181   t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
 182   step2[1] = vcombine_s32(t32[2], t32[3]);
 183   step2[2] = vcombine_s32(t32[4], t32[5]);
 184   step2[3] = vcombine_s32(t32[6], t32[7]);
 185
 186   step2[4] = vaddq_s32(step1[4], step1[5]);
 187   step2[5] = vsubq_s32(step1[4], step1[5]);
 188   step2[6] = vsubq_s32(step1[7], step1[6]);
 189   step2[7] = vaddq_s32(step1[7], step1[6]);
 190
 191   // stage 3
 192   step1[0] = vaddq_s32(step2[1], step2[3]);
 193   step1[1] = vaddq_s32(step2[1], step2[2]);
 194   step1[2] = vsubq_s32(step2[1], step2[2]);
 195   step1[3] = vsubq_s32(step2[1], step2[3]);
 196
 197   t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
 198   t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
 199   t64[0] =
 200       vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
 201   t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
 202                           vget_high_s32(cospis0), 0);
 203   t64[2] =
 204       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
 205   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
 206                           vget_high_s32(cospis0), 0);
 207   t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
 208   t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
 209   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 210   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 211   step1[5] = vcombine_s32(t32[0], t32[1]);
 212   step1[6] = vcombine_s32(t32[2], t32[3]);
 213
 214   // stage 4
 215   *io0 = vaddq_s32(step1[0], step2[7]);
 216   *io1 = vaddq_s32(step1[1], step1[6]);
 217   *io2 = vaddq_s32(step1[2], step1[5]);
 218   *io3 = vaddq_s32(step1[3], step2[4]);
 219   *io4 = vsubq_s32(step1[3], step2[4]);
 220   *io5 = vsubq_s32(step1[2], step1[5]);
 221   *io6 = vsubq_s32(step1[1], step1[6]);
 222   *io7 = vsubq_s32(step1[0], step2[7]);
 223 }
 224
 225 static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2,
 226                                  int16x8_t a3, int16x8_t a4, int16x8_t a5,
 227                                  int16x8_t a6, int16x8_t a7, uint16_t *dest,
 228                                  const int stride, const int bd) {
 229   const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
 230   const uint16_t *dst = dest;
 231   uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7;
 232   uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16;
 233   int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16;
 234
 235   d0 = vld1q_u16(dst);
 236   dst += stride;
 237   d1 = vld1q_u16(dst);
 238   dst += stride;
 239   d2 = vld1q_u16(dst);
 240   dst += stride;
 241   d3 = vld1q_u16(dst);
 242   dst += stride;
 243   d4 = vld1q_u16(dst);
 244   dst += stride;
 245   d5 = vld1q_u16(dst);
 246   dst += stride;
 247   d6 = vld1q_u16(dst);
 248   dst += stride;
 249   d7 = vld1q_u16(dst);
 250
 251   d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0));
 252   d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1));
 253   d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2));
 254   d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3));
 255   d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4));
 256   d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5));
 257   d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6));
 258   d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7));
 259
 260   d0_s16 = vminq_s16(d0_s16, max);
 261   d1_s16 = vminq_s16(d1_s16, max);
 262   d2_s16 = vminq_s16(d2_s16, max);
 263   d3_s16 = vminq_s16(d3_s16, max);
 264   d4_s16 = vminq_s16(d4_s16, max);
 265   d5_s16 = vminq_s16(d5_s16, max);
 266   d6_s16 = vminq_s16(d6_s16, max);
 267   d7_s16 = vminq_s16(d7_s16, max);
 268   d0_u16 = vqshluq_n_s16(d0_s16, 0);
 269   d1_u16 = vqshluq_n_s16(d1_s16, 0);
 270   d2_u16 = vqshluq_n_s16(d2_s16, 0);
 271   d3_u16 = vqshluq_n_s16(d3_s16, 0);
 272   d4_u16 = vqshluq_n_s16(d4_s16, 0);
 273   d5_u16 = vqshluq_n_s16(d5_s16, 0);
 274   d6_u16 = vqshluq_n_s16(d6_s16, 0);
 275   d7_u16 = vqshluq_n_s16(d7_s16, 0);
 276
 277   vst1q_u16(dest, d0_u16);
 278   dest += stride;
 279   vst1q_u16(dest, d1_u16);
 280   dest += stride;
 281   vst1q_u16(dest, d2_u16);
 282   dest += stride;
 283   vst1q_u16(dest, d3_u16);
 284   dest += stride;
 285   vst1q_u16(dest, d4_u16);
 286   dest += stride;
 287   vst1q_u16(dest, d5_u16);
 288   dest += stride;
 289   vst1q_u16(dest, d6_u16);
 290   dest += stride;
 291   vst1q_u16(dest, d7_u16);
 292 }
 293
 294 void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest,
 295                                     int stride, int bd) {
 296   int32x4_t a0 = vld1q_s32(input);
 297   int32x4_t a1 = vld1q_s32(input + 8);
 298   int32x4_t a2 = vld1q_s32(input + 16);
 299   int32x4_t a3 = vld1q_s32(input + 24);
 300   int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
 301
 302   if (bd == 8) {
 303     const int16x8_t cospis = vld1q_s16(kCospi);
 304     const int16x8_t cospisd = vaddq_s16(cospis, cospis);
 305     const int16x4_t cospis0 = vget_low_s16(cospis);     // cospi 0, 8, 16, 24
 306     const int16x4_t cospisd0 = vget_low_s16(cospisd);   // doubled 0, 8, 16, 24
 307     const int16x4_t cospisd1 = vget_high_s16(cospisd);  // doubled 4, 12, 20, 28
 308     int16x4_t b0 = vmovn_s32(a0);
 309     int16x4_t b1 = vmovn_s32(a1);
 310     int16x4_t b2 = vmovn_s32(a2);
 311     int16x4_t b3 = vmovn_s32(a3);
 312     int16x4_t b4, b5, b6, b7;
 313
 314     idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4,
 315                          &b5, &b6, &b7);
 316     idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5,
 317                          b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7);
 318     c0 = vrshrq_n_s16(c0, 5);
 319     c1 = vrshrq_n_s16(c1, 5);
 320     c2 = vrshrq_n_s16(c2, 5);
 321     c3 = vrshrq_n_s16(c3, 5);
 322     c4 = vrshrq_n_s16(c4, 5);
 323     c5 = vrshrq_n_s16(c5, 5);
 324     c6 = vrshrq_n_s16(c6, 5);
 325     c7 = vrshrq_n_s16(c7, 5);
 326   } else {
 327     const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
 328     const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
 329     int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15;
 330
 331     if (bd == 10) {
 332       idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
 333                              &a6, &a7);
 334       idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
 335                              &a10, &a11);
 336       idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
 337                              &a14, &a15);
 338     } else {
 339       idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
 340                              &a6, &a7);
 341       idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9,
 342                              &a10, &a11);
 343       idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13,
 344                              &a14, &a15);
 345     }
 346     c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
 347     c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
 348     c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
 349     c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
 350     c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
 351     c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
 352     c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
 353     c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
 354   }
 355   highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
 356 }
 357
 358 static INLINE void idct8x8_64_half1d_bd10(
 359     const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
 360     int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
 361     int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
 362     int32x4_t *const io7) {
 363   int32x4_t step1[8], step2[8];
 364
 365   transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
 366
 367   // stage 1
 368   step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1);
 369   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
 370   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
 371   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
 372
 373   step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0);
 374   step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1);
 375   step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
 376   step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
 377
 378   step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
 379   step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
 380   step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 381   step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 382
 383   // stage 2
 384   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
 385   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
 386   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
 387
 388   step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
 389   step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0);
 390   step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
 391   step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
 392
 393   step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
 394   step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
 395   step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
 396   step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 397
 398   step2[4] = vaddq_s32(step1[4], step1[5]);
 399   step2[5] = vsubq_s32(step1[4], step1[5]);
 400   step2[6] = vsubq_s32(step1[7], step1[6]);
 401   step2[7] = vaddq_s32(step1[7], step1[6]);
 402
 403   // stage 3
 404   step1[0] = vaddq_s32(step2[0], step2[3]);
 405   step1[1] = vaddq_s32(step2[1], step2[2]);
 406   step1[2] = vsubq_s32(step2[1], step2[2]);
 407   step1[3] = vsubq_s32(step2[0], step2[3]);
 408
 409   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
 410   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
 411   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
 412   step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
 413   step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 414
 415   // stage 4
 416   *io0 = vaddq_s32(step1[0], step2[7]);
 417   *io1 = vaddq_s32(step1[1], step1[6]);
 418   *io2 = vaddq_s32(step1[2], step1[5]);
 419   *io3 = vaddq_s32(step1[3], step2[4]);
 420   *io4 = vsubq_s32(step1[3], step2[4]);
 421   *io5 = vsubq_s32(step1[2], step1[5]);
 422   *io6 = vsubq_s32(step1[1], step1[6]);
 423   *io7 = vsubq_s32(step1[0], step2[7]);
 424 }
 425
 426 static INLINE void idct8x8_64_half1d_bd12(
 427     const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0,
 428     int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3,
 429     int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6,
 430     int32x4_t *const io7) {
 431   int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h,
 432       input_7l, input_7h;
 433   int32x2_t step1l[4], step1h[4];
 434   int32x4_t step1[8], step2[8];
 435   int64x2_t t64[8];
 436   int32x2_t t32[8];
 437
 438   transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7);
 439
 440   // stage 1
 441   input_1l = vget_low_s32(*io1);
 442   input_1h = vget_high_s32(*io1);
 443   input_3l = vget_low_s32(*io3);
 444   input_3h = vget_high_s32(*io3);
 445   input_5l = vget_low_s32(*io5);
 446   input_5h = vget_high_s32(*io5);
 447   input_7l = vget_low_s32(*io7);
 448   input_7h = vget_high_s32(*io7);
 449   step1l[0] = vget_low_s32(*io0);
 450   step1h[0] = vget_high_s32(*io0);
 451   step1l[1] = vget_low_s32(*io2);
 452   step1h[1] = vget_high_s32(*io2);
 453   step1l[2] = vget_low_s32(*io4);
 454   step1h[2] = vget_high_s32(*io4);
 455   step1l[3] = vget_low_s32(*io6);
 456   step1h[3] = vget_high_s32(*io6);
 457
 458   t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1);
 459   t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1);
 460   t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0);
 461   t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0);
 462   t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1);
 463   t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
 464   t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
 465   t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
 466   t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0);
 467   t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0);
 468   t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1);
 469   t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1);
 470   t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0);
 471   t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
 472   t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
 473   t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
 474   t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
 475   t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
 476   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 477   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 478   t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
 479   t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
 480   t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
 481   t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
 482   step1[4] = vcombine_s32(t32[0], t32[1]);
 483   step1[5] = vcombine_s32(t32[2], t32[3]);
 484   step1[6] = vcombine_s32(t32[4], t32[5]);
 485   step1[7] = vcombine_s32(t32[6], t32[7]);
 486
 487   // stage 2
 488   t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0);
 489   t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0);
 490   t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1);
 491   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
 492   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
 493   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
 494   t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
 495   t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
 496   t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0);
 497   t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0);
 498   t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1);
 499   t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
 500   t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
 501   t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
 502   t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
 503   t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
 504   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 505   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 506   t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
 507   t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
 508   t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
 509   t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
 510   step2[0] = vcombine_s32(t32[0], t32[1]);
 511   step2[1] = vcombine_s32(t32[2], t32[3]);
 512   step2[2] = vcombine_s32(t32[4], t32[5]);
 513   step2[3] = vcombine_s32(t32[6], t32[7]);
 514
 515   step2[4] = vaddq_s32(step1[4], step1[5]);
 516   step2[5] = vsubq_s32(step1[4], step1[5]);
 517   step2[6] = vsubq_s32(step1[7], step1[6]);
 518   step2[7] = vaddq_s32(step1[7], step1[6]);
 519
 520   // stage 3
 521   step1[0] = vaddq_s32(step2[0], step2[3]);
 522   step1[1] = vaddq_s32(step2[1], step2[2]);
 523   step1[2] = vsubq_s32(step2[1], step2[2]);
 524   step1[3] = vsubq_s32(step2[0], step2[3]);
 525
 526   t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0);
 527   t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0);
 528   t64[0] =
 529       vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
 530   t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]),
 531                           vget_high_s32(cospis0), 0);
 532   t64[2] =
 533       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
 534   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
 535                           vget_high_s32(cospis0), 0);
 536   t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
 537   t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
 538   t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
 539   t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
 540   step1[5] = vcombine_s32(t32[0], t32[1]);
 541   step1[6] = vcombine_s32(t32[2], t32[3]);
 542
 543   // stage 4
 544   *io0 = vaddq_s32(step1[0], step2[7]);
 545   *io1 = vaddq_s32(step1[1], step1[6]);
 546   *io2 = vaddq_s32(step1[2], step1[5]);
 547   *io3 = vaddq_s32(step1[3], step2[4]);
 548   *io4 = vsubq_s32(step1[3], step2[4]);
 549   *io5 = vsubq_s32(step1[2], step1[5]);
 550   *io6 = vsubq_s32(step1[1], step1[6]);
 551   *io7 = vsubq_s32(step1[0], step2[7]);
 552 }
 553
 554 void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest,
 555                                     int stride, int bd) {
 556   int32x4_t a0 = vld1q_s32(input);
 557   int32x4_t a1 = vld1q_s32(input + 4);
 558   int32x4_t a2 = vld1q_s32(input + 8);
 559   int32x4_t a3 = vld1q_s32(input + 12);
 560   int32x4_t a4 = vld1q_s32(input + 16);
 561   int32x4_t a5 = vld1q_s32(input + 20);
 562   int32x4_t a6 = vld1q_s32(input + 24);
 563   int32x4_t a7 = vld1q_s32(input + 28);
 564   int32x4_t a8 = vld1q_s32(input + 32);
 565   int32x4_t a9 = vld1q_s32(input + 36);
 566   int32x4_t a10 = vld1q_s32(input + 40);
 567   int32x4_t a11 = vld1q_s32(input + 44);
 568   int32x4_t a12 = vld1q_s32(input + 48);
 569   int32x4_t a13 = vld1q_s32(input + 52);
 570   int32x4_t a14 = vld1q_s32(input + 56);
 571   int32x4_t a15 = vld1q_s32(input + 60);
 572   int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
 573
 574   if (bd == 8) {
 575     const int16x8_t cospis = vld1q_s16(kCospi);
 576     const int16x4_t cospis0 = vget_low_s16(cospis);   // cospi 0, 8, 16, 24
 577     const int16x4_t cospis1 = vget_high_s16(cospis);  // cospi 4, 12, 20, 28
 578     int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1));
 579     int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3));
 580     int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5));
 581     int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7));
 582     int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9));
 583     int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11));
 584     int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13));
 585     int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15));
 586
 587     idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
 588     idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7);
 589
 590     c0 = vrshrq_n_s16(b0, 5);
 591     c1 = vrshrq_n_s16(b1, 5);
 592     c2 = vrshrq_n_s16(b2, 5);
 593     c3 = vrshrq_n_s16(b3, 5);
 594     c4 = vrshrq_n_s16(b4, 5);
 595     c5 = vrshrq_n_s16(b5, 5);
 596     c6 = vrshrq_n_s16(b6, 5);
 597     c7 = vrshrq_n_s16(b7, 5);
 598   } else {
 599     const int32x4_t cospis0 = vld1q_s32(kCospi32);      // cospi 0, 8, 16, 24
 600     const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4);  // cospi 4, 12, 20, 28
 601
 602     if (bd == 10) {
 603       idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
 604                              &a6, &a7);
 605       idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
 606                              &a14, &a15);
 607       idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
 608                              &a3, &a11);
 609       idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
 610                              &a7, &a15);
 611     } else {
 612       idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5,
 613                              &a6, &a7);
 614       idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13,
 615                              &a14, &a15);
 616       idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10,
 617                              &a3, &a11);
 618       idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14,
 619                              &a7, &a15);
 620     }
 621     c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5));
 622     c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5));
 623     c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5));
 624     c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5));
 625     c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5));
 626     c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5));
 627     c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5));
 628     c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5));
 629   }
 630   highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd);
 631 }