/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

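// Vertical prediction: every row of the block is a copy of the row of
// pixels directly above the block.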
void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

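// Store mask for 4-wide blocks: vec_sel() takes the first four bytes from
// the prediction and the remaining twelve from the existing destination, so
// a full 16-byte store only modifies the 4x4 block.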
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

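// Horizontal prediction: row i of the block is left[i] broadcast across the
// row.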
void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

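// Emit one 32-pixel row of horizontal prediction and advance to the next
// row.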
#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

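// TrueMotion (TM) prediction: pred(x, y) = clip(left[y] + above[x] -
// above[-1]), computed in 16-bit lanes with vec_packsu() supplying the
// final clamp to [0, 255].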
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}

static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

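// Write one 32-pixel TM row as two 16-byte stores.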
static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;
  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

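// The DC family fills the block with a single value, pre-splatted across
// the vector. For 8x8 only the left half of each 16-byte row belongs to the
// block, so xxpermdi() merges in the existing right half before the store.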
static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

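// DC_128: with no usable neighbors the block is filled with the midpoint
// 128. vec_splat_u8() cannot encode 128 directly (5-bit immediate), so it
// is formed as 1 << 7.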
void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

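// Average of 16 border pixels: vec_sum4s() produces four partial sums of
// four bytes each, vec_sums() folds them together with the rounding term 8,
// and the shift completes (sum + 8) >> 4. The scalar result sits in the
// last element, so it is packed down to a byte and splatted.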
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

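// DC average over both borders of an 8x8 block (8 above + 8 left pixels).
// The loads cover 16 bytes each, so xxpermdi() drops the two partial sums
// that came from bytes beyond the border before the final fold; the result
// is (sum + 8) >> 4.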
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

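// 3-tap filter: returns (a + 2 * b + c + 2) >> 2 without intermediate
// overflow. (a & c) + ((a ^ c) >> 1) equals (a + c) >> 1, and vec_avg()
// then averages with b, rounding up.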
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

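// vec_perm(x, y, sl1) shifts the 32-byte concatenation (x, y) left by one
// byte and returns the first 16 bytes.

// D45 prediction: the above row is smoothed with the 3-tap filter, and each
// successive row shifts it one pixel further left, extending with the
// replicated above-right pixel.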
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

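// D63 prediction: a shallower diagonal. Even rows use the 2-tap average of
// the above row, odd rows the 3-tap filter, and each row pair shifts one
// pixel further left.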
void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}