granicus.if.org Git - libvpx/blob - vpx_dsp/ppc/intrapred_vsx.c

   1 /*
   2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include "./vpx_dsp_rtcd.h"
  12 #include "vpx_dsp/ppc/types_vsx.h"
  13
  14 void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
  15                                const uint8_t *above, const uint8_t *left) {
  16   const uint8x16_t d = vec_vsx_ld(0, above);
  17   int i;
  18   (void)left;
  19
  20   for (i = 0; i < 16; i++, dst += stride) {
  21     vec_vsx_st(d, 0, dst);
  22   }
  23 }
  24
  25 void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
  26                                const uint8_t *above, const uint8_t *left) {
  27   const uint8x16_t d0 = vec_vsx_ld(0, above);
  28   const uint8x16_t d1 = vec_vsx_ld(16, above);
  29   int i;
  30   (void)left;
  31
  32   for (i = 0; i < 32; i++, dst += stride) {
  33     vec_vsx_st(d0, 0, dst);
  34     vec_vsx_st(d1, 16, dst);
  35   }
  36 }
  37
  38 static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
  39
  40 void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
  41                              const uint8_t *above, const uint8_t *left) {
  42   const uint8x16_t d = vec_vsx_ld(0, left);
  43   const uint8x16_t v0 = vec_splat(d, 0);
  44   const uint8x16_t v1 = vec_splat(d, 1);
  45   const uint8x16_t v2 = vec_splat(d, 2);
  46   const uint8x16_t v3 = vec_splat(d, 3);
  47
  48   (void)above;
  49
  50   vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  51   dst += stride;
  52   vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  53   dst += stride;
  54   vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  55   dst += stride;
  56   vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  57 }
  58
  59 void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
  60                                const uint8_t *above, const uint8_t *left) {
  61   const uint8x16_t d = vec_vsx_ld(0, left);
  62   const uint8x16_t v0 = vec_splat(d, 0);
  63   const uint8x16_t v1 = vec_splat(d, 1);
  64   const uint8x16_t v2 = vec_splat(d, 2);
  65   const uint8x16_t v3 = vec_splat(d, 3);
  66
  67   const uint8x16_t v4 = vec_splat(d, 4);
  68   const uint8x16_t v5 = vec_splat(d, 5);
  69   const uint8x16_t v6 = vec_splat(d, 6);
  70   const uint8x16_t v7 = vec_splat(d, 7);
  71
  72   const uint8x16_t v8 = vec_splat(d, 8);
  73   const uint8x16_t v9 = vec_splat(d, 9);
  74   const uint8x16_t v10 = vec_splat(d, 10);
  75   const uint8x16_t v11 = vec_splat(d, 11);
  76
  77   const uint8x16_t v12 = vec_splat(d, 12);
  78   const uint8x16_t v13 = vec_splat(d, 13);
  79   const uint8x16_t v14 = vec_splat(d, 14);
  80   const uint8x16_t v15 = vec_splat(d, 15);
  81
  82   (void)above;
  83
  84   vec_vsx_st(v0, 0, dst);
  85   dst += stride;
  86   vec_vsx_st(v1, 0, dst);
  87   dst += stride;
  88   vec_vsx_st(v2, 0, dst);
  89   dst += stride;
  90   vec_vsx_st(v3, 0, dst);
  91   dst += stride;
  92   vec_vsx_st(v4, 0, dst);
  93   dst += stride;
  94   vec_vsx_st(v5, 0, dst);
  95   dst += stride;
  96   vec_vsx_st(v6, 0, dst);
  97   dst += stride;
  98   vec_vsx_st(v7, 0, dst);
  99   dst += stride;
 100   vec_vsx_st(v8, 0, dst);
 101   dst += stride;
 102   vec_vsx_st(v9, 0, dst);
 103   dst += stride;
 104   vec_vsx_st(v10, 0, dst);
 105   dst += stride;
 106   vec_vsx_st(v11, 0, dst);
 107   dst += stride;
 108   vec_vsx_st(v12, 0, dst);
 109   dst += stride;
 110   vec_vsx_st(v13, 0, dst);
 111   dst += stride;
 112   vec_vsx_st(v14, 0, dst);
 113   dst += stride;
 114   vec_vsx_st(v15, 0, dst);
 115 }
 116
 117 #define H_PREDICTOR_32(v) \
 118   vec_vsx_st(v, 0, dst);  \
 119   vec_vsx_st(v, 16, dst); \
 120   dst += stride
 121
 122 void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 123                                const uint8_t *above, const uint8_t *left) {
 124   const uint8x16_t d0 = vec_vsx_ld(0, left);
 125   const uint8x16_t d1 = vec_vsx_ld(16, left);
 126
 127   const uint8x16_t v0_0 = vec_splat(d0, 0);
 128   const uint8x16_t v1_0 = vec_splat(d0, 1);
 129   const uint8x16_t v2_0 = vec_splat(d0, 2);
 130   const uint8x16_t v3_0 = vec_splat(d0, 3);
 131   const uint8x16_t v4_0 = vec_splat(d0, 4);
 132   const uint8x16_t v5_0 = vec_splat(d0, 5);
 133   const uint8x16_t v6_0 = vec_splat(d0, 6);
 134   const uint8x16_t v7_0 = vec_splat(d0, 7);
 135   const uint8x16_t v8_0 = vec_splat(d0, 8);
 136   const uint8x16_t v9_0 = vec_splat(d0, 9);
 137   const uint8x16_t v10_0 = vec_splat(d0, 10);
 138   const uint8x16_t v11_0 = vec_splat(d0, 11);
 139   const uint8x16_t v12_0 = vec_splat(d0, 12);
 140   const uint8x16_t v13_0 = vec_splat(d0, 13);
 141   const uint8x16_t v14_0 = vec_splat(d0, 14);
 142   const uint8x16_t v15_0 = vec_splat(d0, 15);
 143
 144   const uint8x16_t v0_1 = vec_splat(d1, 0);
 145   const uint8x16_t v1_1 = vec_splat(d1, 1);
 146   const uint8x16_t v2_1 = vec_splat(d1, 2);
 147   const uint8x16_t v3_1 = vec_splat(d1, 3);
 148   const uint8x16_t v4_1 = vec_splat(d1, 4);
 149   const uint8x16_t v5_1 = vec_splat(d1, 5);
 150   const uint8x16_t v6_1 = vec_splat(d1, 6);
 151   const uint8x16_t v7_1 = vec_splat(d1, 7);
 152   const uint8x16_t v8_1 = vec_splat(d1, 8);
 153   const uint8x16_t v9_1 = vec_splat(d1, 9);
 154   const uint8x16_t v10_1 = vec_splat(d1, 10);
 155   const uint8x16_t v11_1 = vec_splat(d1, 11);
 156   const uint8x16_t v12_1 = vec_splat(d1, 12);
 157   const uint8x16_t v13_1 = vec_splat(d1, 13);
 158   const uint8x16_t v14_1 = vec_splat(d1, 14);
 159   const uint8x16_t v15_1 = vec_splat(d1, 15);
 160
 161   (void)above;
 162
 163   H_PREDICTOR_32(v0_0);
 164   H_PREDICTOR_32(v1_0);
 165   H_PREDICTOR_32(v2_0);
 166   H_PREDICTOR_32(v3_0);
 167
 168   H_PREDICTOR_32(v4_0);
 169   H_PREDICTOR_32(v5_0);
 170   H_PREDICTOR_32(v6_0);
 171   H_PREDICTOR_32(v7_0);
 172
 173   H_PREDICTOR_32(v8_0);
 174   H_PREDICTOR_32(v9_0);
 175   H_PREDICTOR_32(v10_0);
 176   H_PREDICTOR_32(v11_0);
 177
 178   H_PREDICTOR_32(v12_0);
 179   H_PREDICTOR_32(v13_0);
 180   H_PREDICTOR_32(v14_0);
 181   H_PREDICTOR_32(v15_0);
 182
 183   H_PREDICTOR_32(v0_1);
 184   H_PREDICTOR_32(v1_1);
 185   H_PREDICTOR_32(v2_1);
 186   H_PREDICTOR_32(v3_1);
 187
 188   H_PREDICTOR_32(v4_1);
 189   H_PREDICTOR_32(v5_1);
 190   H_PREDICTOR_32(v6_1);
 191   H_PREDICTOR_32(v7_1);
 192
 193   H_PREDICTOR_32(v8_1);
 194   H_PREDICTOR_32(v9_1);
 195   H_PREDICTOR_32(v10_1);
 196   H_PREDICTOR_32(v11_1);
 197
 198   H_PREDICTOR_32(v12_1);
 199   H_PREDICTOR_32(v13_1);
 200   H_PREDICTOR_32(v14_1);
 201   H_PREDICTOR_32(v15_1);
 202 }
 203
 204 void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
 205                               const uint8_t *above, const uint8_t *left) {
 206   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
 207   const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
 208   const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
 209   int16x8_t tmp, val;
 210   uint8x16_t d;
 211
 212   d = vec_vsx_ld(0, dst);
 213   tmp = unpack_to_s16_l(d);
 214   val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
 215   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
 216   dst += stride;
 217
 218   d = vec_vsx_ld(0, dst);
 219   tmp = unpack_to_s16_l(d);
 220   val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
 221   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
 222   dst += stride;
 223
 224   d = vec_vsx_ld(0, dst);
 225   tmp = unpack_to_s16_l(d);
 226   val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
 227   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
 228   dst += stride;
 229
 230   d = vec_vsx_ld(0, dst);
 231   tmp = unpack_to_s16_l(d);
 232   val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
 233   vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
 234 }
 235
 236 void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
 237                               const uint8_t *above, const uint8_t *left) {
 238   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
 239   const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
 240   const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
 241   int16x8_t tmp, val;
 242
 243   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 244   val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
 245   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 246   dst += stride;
 247
 248   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 249   val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
 250   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 251   dst += stride;
 252
 253   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 254   val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
 255   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 256   dst += stride;
 257
 258   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 259   val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
 260   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 261   dst += stride;
 262
 263   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 264   val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
 265   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 266   dst += stride;
 267
 268   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 269   val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
 270   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 271   dst += stride;
 272
 273   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 274   val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
 275   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 276   dst += stride;
 277
 278   tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
 279   val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
 280   vec_vsx_st(vec_packsu(val, tmp), 0, dst);
 281 }
 282
 283 static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
 284                               int16x8_t ah, int16x8_t al, int16x8_t tl) {
 285   int16x8_t vh, vl, ls;
 286
 287   ls = vec_splat(l, 0);
 288   vh = vec_sub(vec_add(ls, ah), tl);
 289   vl = vec_sub(vec_add(ls, al), tl);
 290   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 291   dst += stride;
 292
 293   ls = vec_splat(l, 1);
 294   vh = vec_sub(vec_add(ls, ah), tl);
 295   vl = vec_sub(vec_add(ls, al), tl);
 296   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 297   dst += stride;
 298
 299   ls = vec_splat(l, 2);
 300   vh = vec_sub(vec_add(ls, ah), tl);
 301   vl = vec_sub(vec_add(ls, al), tl);
 302   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 303   dst += stride;
 304
 305   ls = vec_splat(l, 3);
 306   vh = vec_sub(vec_add(ls, ah), tl);
 307   vl = vec_sub(vec_add(ls, al), tl);
 308   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 309   dst += stride;
 310
 311   ls = vec_splat(l, 4);
 312   vh = vec_sub(vec_add(ls, ah), tl);
 313   vl = vec_sub(vec_add(ls, al), tl);
 314   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 315   dst += stride;
 316
 317   ls = vec_splat(l, 5);
 318   vh = vec_sub(vec_add(ls, ah), tl);
 319   vl = vec_sub(vec_add(ls, al), tl);
 320   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 321   dst += stride;
 322
 323   ls = vec_splat(l, 6);
 324   vh = vec_sub(vec_add(ls, ah), tl);
 325   vl = vec_sub(vec_add(ls, al), tl);
 326   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 327   dst += stride;
 328
 329   ls = vec_splat(l, 7);
 330   vh = vec_sub(vec_add(ls, ah), tl);
 331   vl = vec_sub(vec_add(ls, al), tl);
 332   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 333 }
 334
 335 void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 336                                 const uint8_t *above, const uint8_t *left) {
 337   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
 338   const uint8x16_t l = vec_vsx_ld(0, left);
 339   const int16x8_t lh = unpack_to_s16_h(l);
 340   const int16x8_t ll = unpack_to_s16_l(l);
 341   const uint8x16_t a = vec_vsx_ld(0, above);
 342   const int16x8_t ah = unpack_to_s16_h(a);
 343   const int16x8_t al = unpack_to_s16_l(a);
 344
 345   tm_predictor_16x8(dst, stride, lh, ah, al, tl);
 346
 347   dst += stride * 8;
 348
 349   tm_predictor_16x8(dst, stride, ll, ah, al, tl);
 350 }
 351
 352 static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
 353                                      const int16x8_t a0h, const int16x8_t a0l,
 354                                      const int16x8_t a1h, const int16x8_t a1l,
 355                                      const int16x8_t tl) {
 356   int16x8_t vh, vl;
 357
 358   vh = vec_sub(vec_add(ls, a0h), tl);
 359   vl = vec_sub(vec_add(ls, a0l), tl);
 360   vec_vsx_st(vec_packsu(vh, vl), 0, dst);
 361   vh = vec_sub(vec_add(ls, a1h), tl);
 362   vl = vec_sub(vec_add(ls, a1l), tl);
 363   vec_vsx_st(vec_packsu(vh, vl), 16, dst);
 364 }
 365
 366 static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
 367                               const int16x8_t l, const uint8x16_t a0,
 368                               const uint8x16_t a1, const int16x8_t tl) {
 369   const int16x8_t a0h = unpack_to_s16_h(a0);
 370   const int16x8_t a0l = unpack_to_s16_l(a0);
 371   const int16x8_t a1h = unpack_to_s16_h(a1);
 372   const int16x8_t a1l = unpack_to_s16_l(a1);
 373
 374   tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
 375   dst += stride;
 376
 377   tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
 378   dst += stride;
 379
 380   tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
 381   dst += stride;
 382
 383   tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
 384   dst += stride;
 385
 386   tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
 387   dst += stride;
 388
 389   tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
 390   dst += stride;
 391
 392   tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
 393   dst += stride;
 394
 395   tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
 396 }
 397
 398 void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 399                                 const uint8_t *above, const uint8_t *left) {
 400   const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
 401   const uint8x16_t l0 = vec_vsx_ld(0, left);
 402   const uint8x16_t l1 = vec_vsx_ld(16, left);
 403   const uint8x16_t a0 = vec_vsx_ld(0, above);
 404   const uint8x16_t a1 = vec_vsx_ld(16, above);
 405
 406   tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
 407   dst += stride * 8;
 408
 409   tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
 410   dst += stride * 8;
 411
 412   tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
 413   dst += stride * 8;
 414
 415   tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
 416 }
 417
 418 static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
 419                                          const uint8x16_t val) {
 420   int i;
 421
 422   for (i = 0; i < 8; i++, dst += stride) {
 423     const uint8x16_t d = vec_vsx_ld(0, dst);
 424     vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
 425   }
 426 }
 427
 428 static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
 429                                            const uint8x16_t val) {
 430   int i;
 431
 432   for (i = 0; i < 16; i++, dst += stride) {
 433     vec_vsx_st(val, 0, dst);
 434   }
 435 }
 436
 437 void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 438                                     const uint8_t *above, const uint8_t *left) {
 439   const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
 440   (void)above;
 441   (void)left;
 442
 443   dc_fill_predictor_16x16(dst, stride, v128);
 444 }
 445
 446 static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
 447                                            const uint8x16_t val) {
 448   int i;
 449
 450   for (i = 0; i < 32; i++, dst += stride) {
 451     vec_vsx_st(val, 0, dst);
 452     vec_vsx_st(val, 16, dst);
 453   }
 454 }
 455
 456 void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 457                                     const uint8_t *above, const uint8_t *left) {
 458   const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
 459   (void)above;
 460   (void)left;
 461
 462   dc_fill_predictor_32x32(dst, stride, v128);
 463 }
 464
 465 static uint8x16_t avg16(const uint8_t *values) {
 466   const int32x4_t sum4s =
 467       (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
 468   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
 469   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
 470
 471   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
 472                    3);
 473 }
 474
 475 void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 476                                      const uint8_t *above,
 477                                      const uint8_t *left) {
 478   (void)above;
 479
 480   dc_fill_predictor_16x16(dst, stride, avg16(left));
 481 }
 482
 483 void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 484                                     const uint8_t *above, const uint8_t *left) {
 485   (void)left;
 486
 487   dc_fill_predictor_16x16(dst, stride, avg16(above));
 488 }
 489
 490 static uint8x16_t avg32(const uint8_t *values) {
 491   const uint8x16_t v0 = vec_vsx_ld(0, values);
 492   const uint8x16_t v1 = vec_vsx_ld(16, values);
 493   const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
 494   const int32x4_t sum4s =
 495       (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
 496   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
 497   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
 498
 499   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
 500                    3);
 501 }
 502
 503 void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 504                                      const uint8_t *above,
 505                                      const uint8_t *left) {
 506   (void)above;
 507
 508   dc_fill_predictor_32x32(dst, stride, avg32(left));
 509 }
 510
 511 void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 512                                     const uint8_t *above, const uint8_t *left) {
 513   (void)left;
 514
 515   dc_fill_predictor_32x32(dst, stride, avg32(above));
 516 }
 517
 518 static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
 519   const uint8x16_t a0 = vec_vsx_ld(0, above);
 520   const uint8x16_t l0 = vec_vsx_ld(0, left);
 521   const int32x4_t sum4s =
 522       (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
 523   const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
 524   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
 525   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));
 526
 527   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
 528                    3);
 529 }
 530
 531 static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
 532   const uint8x16_t a0 = vec_vsx_ld(0, above);
 533   const uint8x16_t l0 = vec_vsx_ld(0, left);
 534   const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
 535   const int32x4_t sum4s =
 536       (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
 537   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
 538   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));
 539
 540   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
 541                    3);
 542 }
 543
 544 void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
 545                               const uint8_t *above, const uint8_t *left) {
 546   dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
 547 }
 548
 549 void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 550                                 const uint8_t *above, const uint8_t *left) {
 551   dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
 552 }
 553
 554 static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
 555   const uint8x16_t a0 = vec_vsx_ld(0, above);
 556   const uint8x16_t a1 = vec_vsx_ld(16, above);
 557   const uint8x16_t l0 = vec_vsx_ld(0, left);
 558   const uint8x16_t l1 = vec_vsx_ld(16, left);
 559   const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
 560   const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
 561   const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
 562   const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
 563   const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));
 564
 565   return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
 566                    3);
 567 }
 568
 569 void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 570                                 const uint8_t *above, const uint8_t *left) {
 571   dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
 572 }
 573
 574 static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
 575                        const uint8x16_t c) {
 576   const uint8x16_t ac =
 577       vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));
 578
 579   return vec_avg(ac, b);
 580 }
 581
 582 // Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
 583 static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
 584                                 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };
 585
 586 void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
 587                                const uint8_t *above, const uint8_t *left) {
 588   const uint8x16_t af = vec_vsx_ld(0, above);
 589   const uint8x16_t above_right = vec_splat(af, 7);
 590   const uint8x16_t a = xxpermdi(af, above_right, 1);
 591   const uint8x16_t b = vec_perm(a, above_right, sl1);
 592   const uint8x16_t c = vec_perm(b, above_right, sl1);
 593   uint8x16_t row = avg3(a, b, c);
 594   int i;
 595   (void)left;
 596
 597   for (i = 0; i < 8; i++) {
 598     const uint8x16_t d = vec_vsx_ld(0, dst);
 599     vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
 600     dst += stride;
 601     row = vec_perm(row, above_right, sl1);
 602   }
 603 }
 604
 605 void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 606                                  const uint8_t *above, const uint8_t *left) {
 607   const uint8x16_t a = vec_vsx_ld(0, above);
 608   const uint8x16_t above_right = vec_splat(a, 15);
 609   const uint8x16_t b = vec_perm(a, above_right, sl1);
 610   const uint8x16_t c = vec_perm(b, above_right, sl1);
 611   uint8x16_t row = avg3(a, b, c);
 612   int i;
 613   (void)left;
 614
 615   for (i = 0; i < 16; i++) {
 616     vec_vsx_st(row, 0, dst);
 617     dst += stride;
 618     row = vec_perm(row, above_right, sl1);
 619   }
 620 }
 621
 622 void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 623                                  const uint8_t *above, const uint8_t *left) {
 624   const uint8x16_t a0 = vec_vsx_ld(0, above);
 625   const uint8x16_t a1 = vec_vsx_ld(16, above);
 626   const uint8x16_t above_right = vec_splat(a1, 15);
 627   const uint8x16_t b0 = vec_perm(a0, a1, sl1);
 628   const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
 629   const uint8x16_t c0 = vec_perm(b0, b1, sl1);
 630   const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
 631   uint8x16_t row0 = avg3(a0, b0, c0);
 632   uint8x16_t row1 = avg3(a1, b1, c1);
 633   int i;
 634   (void)left;
 635
 636   for (i = 0; i < 32; i++) {
 637     vec_vsx_st(row0, 0, dst);
 638     vec_vsx_st(row1, 16, dst);
 639     dst += stride;
 640     row0 = vec_perm(row0, row1, sl1);
 641     row1 = vec_perm(row1, above_right, sl1);
 642   }
 643 }
 644
 645 void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
 646                                  const uint8_t *above, const uint8_t *left) {
 647   const uint8x16_t a0 = vec_vsx_ld(0, above);
 648   const uint8x16_t a1 = vec_vsx_ld(16, above);
 649   const uint8x16_t above_right = vec_splat(a1, 0);
 650   const uint8x16_t b = vec_perm(a0, above_right, sl1);
 651   const uint8x16_t c = vec_perm(b, above_right, sl1);
 652   uint8x16_t row0 = vec_avg(a0, b);
 653   uint8x16_t row1 = avg3(a0, b, c);
 654   int i;
 655   (void)left;
 656
 657   for (i = 0; i < 8; i++) {
 658     vec_vsx_st(row0, 0, dst);
 659     vec_vsx_st(row1, 0, dst + stride);
 660     dst += stride * 2;
 661     row0 = vec_perm(row0, above_right, sl1);
 662     row1 = vec_perm(row1, above_right, sl1);
 663   }
 664 }
 665
 666 void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
 667                                  const uint8_t *above, const uint8_t *left) {
 668   const uint8x16_t a0 = vec_vsx_ld(0, above);
 669   const uint8x16_t a1 = vec_vsx_ld(16, above);
 670   const uint8x16_t a2 = vec_vsx_ld(32, above);
 671   const uint8x16_t above_right = vec_splat(a2, 0);
 672   const uint8x16_t b0 = vec_perm(a0, a1, sl1);
 673   const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
 674   const uint8x16_t c0 = vec_perm(b0, b1, sl1);
 675   const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
 676   uint8x16_t row0_0 = vec_avg(a0, b0);
 677   uint8x16_t row0_1 = vec_avg(a1, b1);
 678   uint8x16_t row1_0 = avg3(a0, b0, c0);
 679   uint8x16_t row1_1 = avg3(a1, b1, c1);
 680   int i;
 681   (void)left;
 682
 683   for (i = 0; i < 16; i++) {
 684     vec_vsx_st(row0_0, 0, dst);
 685     vec_vsx_st(row0_1, 16, dst);
 686     vec_vsx_st(row1_0, 0, dst + stride);
 687     vec_vsx_st(row1_1, 16, dst + stride);
 688     dst += stride * 2;
 689     row0_0 = vec_perm(row0_0, row0_1, sl1);
 690     row0_1 = vec_perm(row0_1, above_right, sl1);
 691     row1_0 = vec_perm(row1_0, row1_1, sl1);
 692     row1_1 = vec_perm(row1_1, above_right, sl1);
 693   }
 694 }