// vec_mladd results in overflow.
// Dequantizes one vector of 32x32-block coefficients: computes
// (qcoeff * dequant) / 2 per 16-bit lane, rounding towards zero to match the
// C reference implementation, which uses integer division.
static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
                                            int16x8_t dequant) {
  // Full 32-bit products of the even and odd 16-bit lanes (a single
  // vec_mladd would overflow 16 bits — see comment above).
  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
  // Add 1 if negative to round towards zero because the C uses division.
  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
  // Single permute interleaves the even/odd 32-bit results back into 16-bit
  // lane order, replacing the previous vec_pack + vec_perm pair.
  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
}
void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
#include "vpx_dsp/ppc/txfm_common_vsx.h"
#include "vpx_dsp/ppc/transpose_vsx.h"
-static const uint8x16_t vec_perm_pack = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05,
- 0x14, 0x15, 0x08, 0x09, 0x18, 0x19,
- 0x0C, 0x0D, 0x1C, 0x1D };
-
// Returns ((a +/- b) * cospi16 + (2 << 13)) >> 14.
static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add,
int16x8_t *sub) {
const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
// There's no pack operation for even and odd, so we need to permute.
- *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_pack);
- *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_pack);
+ *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+ *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}
// Returns (a * c1 +/- b * c2 + (2 << 13)) >> 14
const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits);
// There's no pack operation for even and odd, so we need to permute.
- *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_pack);
- *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_pack);
+ *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack);
+ *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack);
}
// While other architectures combine the load and the stage 1 operations, Power9
// vec_mladd results in overflow.
// Dequantizes one vector of 32x32-block coefficients: computes
// (qcoeff * dequant) / 2 per 16-bit lane, rounding towards zero to match the
// C reference implementation, which uses integer division.
static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
                                            int16x8_t dequant) {
  // Full 32-bit products of the even and odd 16-bit lanes (a single
  // vec_mladd would overflow 16 bits — see comment above).
  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
  // Add 1 if negative to round towards zero because the C uses division.
  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
  // Single permute interleaves the even/odd 32-bit results back into 16-bit
  // lane order, replacing the previous vec_pack + vec_perm pair.
  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
}
static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D,
0x0E, 0x0F, 0x00, 0x01 };
-static const uint8x16_t vec_perm_merge = { 0x00, 0x01, 0x08, 0x09, 0x02, 0x03,
- 0x0A, 0x0B, 0x04, 0x05, 0x0C, 0x0D,
- 0x06, 0x07, 0x0E, 0x0F };
+static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11,
+ 0x04, 0x05, 0x14, 0x15,
+ 0x08, 0x09, 0x18, 0x19,
+ 0x0C, 0x0D, 0x1C, 0x1D };
#endif // VPX_DSP_PPC_TYPES_VSX_H_