// -----------------------------------------------------------------------------
// D63 (diagonal, ~63 degrees down-right) intra prediction for a 4x4
// high-bit-depth block. Only the `above` row is used: even output rows are
// the two-tap rounded average of adjacent above pixels, odd rows the
// three-tap average, and rows 2/3 are rows 0/1 shifted left by one pixel
// with above[3] shifted in on the right. `left` and `bd` are unused
// (averages of bd-bit samples stay within bd bits, so no clamping).
void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;

  // Lane i of these vectors holds above[i], above[i + 1], above[i + 2].
  const uint16x4_t az0 = vld1_u16(above + 0);
  const uint16x4_t az1 = vld1_u16(above + 1);
  const uint16x4_t az2 = vld1_u16(above + 2);
  // Right-hand padding for the shifted rows: above[3] in every lane.
  const uint16x4_t pad = vld1_dup_u16(above + 3);

  // avg2[i] = (above[i] + above[i+1] + 1) >> 1.
  const uint16x4_t avg2 = vrhadd_u16(az0, az1);
  // avg3[i] = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2; the
  // vhadd-then-vrhadd form is exactly equal to that expression.
  const uint16x4_t avg3 = vrhadd_u16(vhadd_u16(az0, az2), az1);
  // Rows 2/3: rows 0/1 shifted left one lane, padded with above[3].
  const uint16x4_t avg2_sh = vext_u16(avg2, pad, 1);
  const uint16x4_t avg3_sh = vext_u16(avg3, pad, 1);

  vst1_u16(dst + 0 * stride, avg2);
  vst1_u16(dst + 1 * stride, avg3);
  vst1_u16(dst + 2 * stride, avg2_sh);
  vst1_u16(dst + 3 * stride, avg3_sh);
}
-
// D63 (diagonal, ~63 degrees down-right) intra prediction for an 8x8
// high-bit-depth block. Only the `above` row is used: even output rows are
// the two-tap rounded average of adjacent above pixels, odd rows the
// three-tap average, and row r (r >= 2) is row (r & 1) shifted left by r/2
// pixels with above[7] shifted in on the right. `left` and `bd` are unused
// (averages of bd-bit samples stay within bd bits, so no clamping).
void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;

  // Lane i of these vectors holds above[i], above[i + 1], above[i + 2].
  const uint16x8_t az0 = vld1q_u16(above + 0);
  const uint16x8_t az1 = vld1q_u16(above + 1);
  const uint16x8_t az2 = vld1q_u16(above + 2);
  // Right-hand padding for the shifted rows: above[7] in every lane.
  const uint16x8_t pad = vld1q_dup_u16(above + 7);

  // avg2[i] = (above[i] + above[i+1] + 1) >> 1;
  // avg3[i] = (above[i] + 2*above[i+1] + above[i+2] + 2) >> 2 — the
  // vhadd-then-vrhadd form is exactly equal to that expression.
  const uint16x8_t avg2 = vrhaddq_u16(az0, az1);
  const uint16x8_t avg3 = vrhaddq_u16(vhaddq_u16(az0, az2), az1);

  vst1q_u16(dst, avg2);
  dst += stride;
  vst1q_u16(dst, avg3);
  dst += stride;
  // Remaining rows: rows 0/1 shifted left by 1, 2, 3 lanes respectively.
  vst1q_u16(dst, vextq_u16(avg2, pad, 1));
  dst += stride;
  vst1q_u16(dst, vextq_u16(avg3, pad, 1));
  dst += stride;
  vst1q_u16(dst, vextq_u16(avg2, pad, 2));
  dst += stride;
  vst1q_u16(dst, vextq_u16(avg3, pad, 2));
  dst += stride;
  vst1q_u16(dst, vextq_u16(avg2, pad, 3));
  dst += stride;
  vst1q_u16(dst, vextq_u16(avg3, pad, 3));
}
-
// D63 (diagonal, ~63 degrees down-right) intra prediction for a 16x16
// high-bit-depth block. Only the `above` row is used (indices 0..17 are
// read): even output rows hold the two-tap rounded average
// AVG2(above[i], above[i+1]), odd rows the three-tap average
// AVG3(above[i], above[i+1], above[i+2]), and row r (r >= 2) is
// row (r & 1) shifted left by r/2 pixels, padded on the right with
// above[15]. `left` and `bd` are unused (averages of bd-bit samples stay
// within bd bits, so no clamping is needed).
void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0_lo, d0_hi, d1_lo, d1_hi;
  (void)left;
  (void)bd;

  // above[i..i+7] at the offsets needed for the averages, for the low
  // (columns 0-7) and high (columns 8-15) halves of the row.
  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  // Right-hand padding for the shifted rows: above[15] in every lane.
  a15 = vld1q_dup_u16(above + 15);

  // d0 = AVG2 row; d1 = AVG3 row — vrhadd(vhadd(a, c), b) is exactly
  // (a + 2*b + c + 2) >> 2.
  d0_lo = vrhaddq_u16(a0, a1);
  d0_hi = vrhaddq_u16(a8, a9);
  d1_lo = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1_hi = vrhaddq_u16(vhaddq_u16(a8, a10), a9);

  // Rows 0/1: unshifted AVG2/AVG3 rows.
  vst1q_u16(dst + 0 * stride + 0, d0_lo);
  vst1q_u16(dst + 0 * stride + 8, d0_hi);
  vst1q_u16(dst + 1 * stride + 0, d1_lo);
  vst1q_u16(dst + 1 * stride + 8, d1_hi);
  // Rows 2/3: shift left by 1 lane.
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0_lo, d0_hi, 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_hi, a15, 1));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1_lo, d1_hi, 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_hi, a15, 1));
  // Rows 4/5: shift left by 2 lanes.
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0_lo, d0_hi, 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_hi, a15, 2));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1_lo, d1_hi, 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_hi, a15, 2));
  // Rows 6/7: shift left by 3 lanes.
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0_lo, d0_hi, 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_hi, a15, 3));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1_lo, d1_hi, 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_hi, a15, 3));
  // Rows 8/9: shift left by 4 lanes.
  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0_lo, d0_hi, 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_hi, a15, 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1_lo, d1_hi, 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_hi, a15, 4));
  // Rows 10/11: shift left by 5 lanes.
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0_lo, d0_hi, 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_hi, a15, 5));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1_lo, d1_hi, 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_hi, a15, 5));
  // Rows 12/13: shift left by 6 lanes.
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0_lo, d0_hi, 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_hi, a15, 6));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1_lo, d1_hi, 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_hi, a15, 6));
  // Rows 14/15: shift left by 7 lanes.
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0_lo, d0_hi, 7));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_hi, a15, 7));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1_lo, d1_hi, 7));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_hi, a15, 7));
}
-
// D63 (diagonal, ~63 degrees down-right) intra prediction for a 32x32
// high-bit-depth block. Only the `above` row is used (indices 0..33 are
// read): even output rows hold the two-tap rounded average
// AVG2(above[i], above[i+1]), odd rows the three-tap average
// AVG3(above[i], above[i+1], above[i+2]), and row r (r >= 2) is
// row (r & 1) shifted left by r/2 pixels, padded on the right with
// above[31]. `left` and `bd` are unused (averages of bd-bit samples stay
// within bd bits, so no clamping is needed).
void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
      d1[4];
  (void)left;
  (void)bd;

  // above[i..i+7] at the offsets needed for the averages, one triple of
  // loads per 8-column quarter of the row.
  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a18 = vld1q_u16(above + 18);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a26 = vld1q_u16(above + 26);
  // Right-hand padding for the shifted rows: above[31] in every lane.
  a31 = vld1q_dup_u16(above + 31);

  // d0[] = AVG2 row, d1[] = AVG3 row, each as four 8-lane vectors;
  // vrhadd(vhadd(a, c), b) is exactly (a + 2*b + c + 2) >> 2.
  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d0[2] = vrhaddq_u16(a16, a17);
  d0[3] = vrhaddq_u16(a24, a25);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);

  // Rows 0/1: unshifted AVG2/AVG3 rows.
  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  // Rows 2/3: shift left by 1 lane.
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[3], a31, 1));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[3], a31, 1));

  // Rows 4/5: shift left by 2 lanes.
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[3], a31, 2));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[3], a31, 2));

  // Rows 6/7: shift left by 3 lanes.
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[3], a31, 3));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[3], a31, 3));

  // Rows 8/9: shift left by 4 lanes.
  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[3], a31, 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[3], a31, 4));

  // Rows 10/11: shift left by 5 lanes.
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[3], a31, 5));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[3], a31, 5));

  // Rows 12/13: shift left by 6 lanes.
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[3], a31, 6));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[3], a31, 6));

  // Rows 14/15: shift left by 7 lanes.
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[3], a31, 7));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[3], a31, 7));

  // Rows 16/17: shift left by 8 lanes — select the next vector directly;
  // the final 8 columns are all padding (above[31]).
  vst1q_u16(dst + 16 * stride + 0, d0[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[2]);
  vst1q_u16(dst + 16 * stride + 16, d0[3]);
  vst1q_u16(dst + 16 * stride + 24, a31);
  vst1q_u16(dst + 17 * stride + 0, d1[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[2]);
  vst1q_u16(dst + 17 * stride + 16, d1[3]);
  vst1q_u16(dst + 17 * stride + 24, a31);

  // Rows 18/19: shift left by 9 lanes (1 lane past the d*[1] vectors).
  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[3], a31, 1));
  vst1q_u16(dst + 18 * stride + 24, a31);
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[3], a31, 1));
  vst1q_u16(dst + 19 * stride + 24, a31);

  // Rows 20/21: shift left by 10 lanes.
  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[3], a31, 2));
  vst1q_u16(dst + 20 * stride + 24, a31);
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[3], a31, 2));
  vst1q_u16(dst + 21 * stride + 24, a31);

  // Rows 22/23: shift left by 11 lanes.
  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[3], a31, 3));
  vst1q_u16(dst + 22 * stride + 24, a31);
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[3], a31, 3));
  vst1q_u16(dst + 23 * stride + 24, a31);

  // Rows 24/25: shift left by 12 lanes.
  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[3], a31, 4));
  vst1q_u16(dst + 24 * stride + 24, a31);
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[3], a31, 4));
  vst1q_u16(dst + 25 * stride + 24, a31);

  // Rows 26/27: shift left by 13 lanes.
  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[3], a31, 5));
  vst1q_u16(dst + 26 * stride + 24, a31);
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[3], a31, 5));
  vst1q_u16(dst + 27 * stride + 24, a31);

  // Rows 28/29: shift left by 14 lanes.
  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[3], a31, 6));
  vst1q_u16(dst + 28 * stride + 24, a31);
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[3], a31, 6));
  vst1q_u16(dst + 29 * stride + 24, a31);

  // Rows 30/31: shift left by 15 lanes.
  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[3], a31, 7));
  vst1q_u16(dst + 30 * stride + 24, a31);
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[3], a31, 7));
  vst1q_u16(dst + 31 * stride + 24, a31);
}
-
// -----------------------------------------------------------------------------

void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above,
const uint16_t *left, int bd) {