From: Alexandra Hájková
Date: Sat, 26 May 2018 13:10:10 +0000 (+0000)
Subject: ppc: add vp9_iht8x8_64_add_vsx
X-Git-Tag: v1.8.0~589^2~1
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4997a29c861e5bbd7d00a8a375a65160b3f6a695;p=libvpx

ppc: add vp9_iht8x8_64_add_vsx

Change-Id: I99a9535bf1ae58c494113fc88d9616bda202716a
---

diff --git a/test/dct_test.cc b/test/dct_test.cc
index 0d13e6b14..9abba980c 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -684,12 +684,13 @@ INSTANTIATE_TEST_CASE_P(
 #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
-static const FuncInfo ht_vsx_func_info[1] = {
-  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_vsx>, 4, 1 }
+static const FuncInfo ht_vsx_func_info[2] = {
+  { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_vsx>, 4, 1 },
+  { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_vsx>, 8, 1 }
 };
 
 INSTANTIATE_TEST_CASE_P(VSX, TransHT,
-                        ::testing::Combine(::testing::Range(0, 1),
+                        ::testing::Combine(::testing::Range(0, 2),
                                            ::testing::Values(ht_vsx_func_info),
                                            ::testing::Range(0, 4),
                                            ::testing::Values(VPX_BITS_8)));
diff --git a/vp9/common/ppc/vp9_idct_vsx.c b/vp9/common/ppc/vp9_idct_vsx.c
index a7b66e7fd..a9426b168 100644
--- a/vp9/common/ppc/vp9_idct_vsx.c
+++ b/vp9/common/ppc/vp9_idct_vsx.c
@@ -45,3 +45,40 @@ void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
 
   vpx_round_store4x4_vsx(in, out, dest, stride);
 }
+
+void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
+                           int tx_type) {
+  int16x8_t in[8], out[8];
+
+  // load input data
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+  in[2] = load_tran_low(2 * 8 * sizeof(*input), input);
+  in[3] = load_tran_low(3 * 8 * sizeof(*input), input);
+  in[4] = load_tran_low(4 * 8 * sizeof(*input), input);
+  in[5] = load_tran_low(5 * 8 * sizeof(*input), input);
+  in[6] = load_tran_low(6 * 8 * sizeof(*input), input);
+  in[7] = load_tran_low(7 * 8 * sizeof(*input), input);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    case ADST_DCT:
+      vpx_idct8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+    case DCT_ADST:
+      vp9_iadst8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vp9_iadst8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+  }
+
+  vpx_round_store8x8_vsx(in, dest, stride);
+}
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c5e99764a..4afbffa90 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -68,7 +68,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
     # Note that there are more specializations appended when
     # CONFIG_VP9_HIGHBITDEPTH is off.
     specialize qw/vp9_iht4x4_16_add neon sse2 vsx/;
-    specialize qw/vp9_iht8x8_64_add neon sse2/;
+    specialize qw/vp9_iht8x8_64_add neon sse2 vsx/;
     specialize qw/vp9_iht16x16_256_add neon sse2/;
     if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
       # Note that these specializations are appended to the above ones.
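The 2-D inverse hybrid transform above is separable: tx_type only selects
which pair of 8-point 1-D transforms (iDCT or iADST) is applied, and each
VSX helper transposes its input first, so two back-to-back calls cover both
rows and columns. A minimal illustrative sketch of the same dispatch,
table-driven instead of switch-based (the helper name iht8x8_pair_sketch is
hypothetical, not part of the commit; it relies on TX_TYPE running
DCT_DCT = 0 through ADST_ADST = 3):

    typedef void (*transform_1d_vsx)(int16x8_t *in, int16x8_t *out);

    static void iht8x8_pair_sketch(int16x8_t in[8], int16x8_t out[8],
                                   int tx_type) {
      // { first pass, second pass } per tx_type, mirroring the switch above.
      static const transform_1d_vsx pass[4][2] = {
        { vpx_idct8_vsx, vpx_idct8_vsx },    // DCT_DCT
        { vpx_idct8_vsx, vp9_iadst8_vsx },   // ADST_DCT
        { vp9_iadst8_vsx, vpx_idct8_vsx },   // DCT_ADST
        { vp9_iadst8_vsx, vp9_iadst8_vsx },  // ADST_ADST
      };
      pass[tx_type][0](in, out);  // first 1-D pass (transposes internally)
      pass[tx_type][1](out, in);  // second 1-D pass; result lands back in `in`
    }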
diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c
index 89b5d87bf..59b053190 100644
--- a/vpx_dsp/ppc/inv_txfm_vsx.c
+++ b/vpx_dsp/ppc/inv_txfm_vsx.c
@@ -19,63 +19,85 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/inv_txfm.h"
 
-static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
-                              16364, 16364, 16364, 16364 };
-static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
-                              16305, 16305, 16305, 16305 };
-static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
-                              16207, 16207, 16207, 16207 };
-static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
-                              16069, 16069, 16069, 16069 };
-static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
-                               -16069, -16069, -16069, -16069 };
-static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
-                              15893, 15893, 15893, 15893 };
-static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
-                              15679, 15679, 15679, 15679 };
-static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
-                              15426, 15426, 15426, 15426 };
-static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
-                              15137, 15137, 15137, 15137 };
-static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
-                               -15137, -15137, -15137, -15137 };
-static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
-                              14811, 14811, 14811, 14811 };
-static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
-                               14449, 14449, 14449, 14449 };
-static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
-                               14053, 14053, 14053, 14053 };
-static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
-                               13623, 13623, 13623, 13623 };
-static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
-                               13160, 13160, 13160, 13160 };
-static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
-                               12665, 12665, 12665, 12665 };
-static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
-                               12140, 12140, 12140, 12140 };
-static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
-                               11585, 11585, 11585, 11585 };
-static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
-                               11003, 11003, 11003, 11003 };
-static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
-                               10394, 10394, 10394, 10394 };
-static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 };
-static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 };
-static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
-                                -9102, -9102, -9102, -9102 };
-static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 };
-static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 };
-static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 };
-static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 };
-static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270,
-                                -6270, -6270, -6270, -6270 };
-static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 };
-static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 };
-static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 };
-static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 };
-static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
-static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
-static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364,
+                                    16364, 16364, 16364, 16364 };
+static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305,
+                                    16305, 16305, 16305, 16305 };
+static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305,
+                                     -16305, -16305, -16305, -16305 };
+static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207,
+                                    16207, 16207, 16207, 16207 };
+static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069,
+                                    16069, 16069, 16069, 16069 };
+static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069,
+                                     -16069, -16069, -16069, -16069 };
+static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893,
+                                    15893, 15893, 15893, 15893 };
+static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679,
+                                    15679, 15679, 15679, 15679 };
+static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426,
+                                    15426, 15426, 15426, 15426 };
+static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137,
+                                    15137, 15137, 15137, 15137 };
+static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137,
+                                     -15137, -15137, -15137, -15137 };
+static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811,
+                                    14811, 14811, 14811, 14811 };
+static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449,
+                                     14449, 14449, 14449, 14449 };
+static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449,
+                                      -14449, -14449, -14449, -14449 };
+static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053,
+                                     14053, 14053, 14053, 14053 };
+static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623,
+                                     13623, 13623, 13623, 13623 };
+static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160,
+                                     13160, 13160, 13160, 13160 };
+static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665,
+                                     12665, 12665, 12665, 12665 };
+static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140,
+                                     12140, 12140, 12140, 12140 };
+static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585,
+                                     11585, 11585, 11585, 11585 };
+static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003,
+                                     11003, 11003, 11003, 11003 };
+static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394,
+                                     10394, 10394, 10394, 10394 };
+static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394,
+                                      -10394, -10394, -10394, -10394 };
+static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760,
+                                     9760, 9760, 9760, 9760 };
+static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102,
+                                     9102, 9102, 9102, 9102 };
+static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102,
+                                      -9102, -9102, -9102, -9102 };
+static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585,
+                                      -11585, -11585, -11585, -11585 };
+static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423,
+                                     8423, 8423, 8423, 8423 };
+static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723,
+                                     7723, 7723, 7723, 7723 };
+static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005,
+                                     7005, 7005, 7005, 7005 };
+static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270,
+                                     6270, 6270, 6270, 6270 };
+static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270,
+                                      -6270, -6270, -6270, -6270 };
+static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520,
+                                     5520, 5520, 5520, 5520 };
+static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756,
+                                     4756, 4756, 4756, 4756 };
+static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756,
+                                      -4756, -4756, -4756, -4756 };
+static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981,
+                                     3981, 3981, 3981, 3981 };
+static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196,
+                                     3196, 3196, 3196, 3196 };
+static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404,
+                                     2404, 2404, 2404, 2404 };
+static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606,
+                                     1606, 1606, 1606, 1606 };
+static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
 
 static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283,
                                        5283, 5283, 5283, 5283 };
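The cospi tables above are the standard VP9 14-bit fixed-point cosine
constants splatted across eight lanes: cospiN_v holds
round(16384 * cos(N * pi / 64)), matching the scalar cospi_N_64 values in
vpx_dsp/txfm_common.h, and the m variants are their negations (likewise, the
sinpi_K_9_v values used by the 4-point ADST are
round(16384 * (2/3) * sqrt(2) * sin(K * pi / 9))). A small stand-alone
sketch that regenerates the lane values (illustrative only, not part of the
build):

    #include <math.h>
    #include <stdio.h>

    // Prints the value splatted into every lane of cospiN_v, N = 1..31.
    int main(void) {
      int n;
      for (n = 1; n < 32; n++)
        printf("cospi%d_v = %ld\n", n, lround(16384.0 * cos(n * M_PI / 64.0)));
      return 0;
    }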
@@ -86,11 +108,16 @@ static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377,
 static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212,
                                        15212, 15212, 15212, 15212 };
 
-static const uint8x16_t mask1 = {
+static uint8x16_t tr8_mask0 = {
   0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
 };
+static uint8x16_t tr8_mask1 = {
+  0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
+  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
+};
+
 #define ROUND_SHIFT_INIT                                                  \
   const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13));    \
   const uint32x4_t shift14 = vec_splat_u32(14);
@@ -122,13 +149,13 @@ static const uint8x16_t mask1 = {
   out1 = vec_sub(step0, step1);                                        \
   out1 = vec_perm(out1, out1, mask0);
 
-#define PACK_STORE(v0, v1)                                             \
-  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);                  \
-  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);                  \
-  output_v = vec_packsu(tmp16_0, tmp16_1);                             \
-                                                                       \
-  vec_vsx_st(output_v, 0, tmp_dest);                                   \
-  for (i = 0; i < 4; i++)                                              \
+#define PACK_STORE(v0, v1)                                              \
+  tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0);               \
+  tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1);               \
+  output_v = vec_packsu(tmp16_0, tmp16_1);                              \
+                                                                        \
+  vec_vsx_st(output_v, 0, tmp_dest);                                    \
+  for (i = 0; i < 4; i++)                                               \
     for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
 
 void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
@@ -283,28 +310,20 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
 #define PIXEL_ADD(in, out, add, shiftx) \
   out = vec_add(vec_sra(vec_add(in, add), shiftx), out);
 
-static uint8x16_t tr8_mask0 = {
-  0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
-  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
-};
-static uint8x16_t tr8_mask1 = {
-  0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
-  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F
-};
-void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  int32x4_t temp10, temp11;
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) {
   int16x8_t step0, step1, step2, step3, step4, step5, step6, step7;
-  int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1,
-      tmp16_2, tmp16_3;
-  int16x8_t src0 = load_tran_low(0, input);
-  int16x8_t src1 = load_tran_low(8 * sizeof(*input), input);
-  int16x8_t src2 = load_tran_low(16 * sizeof(*input), input);
-  int16x8_t src3 = load_tran_low(24 * sizeof(*input), input);
-  int16x8_t src4 = load_tran_low(32 * sizeof(*input), input);
-  int16x8_t src5 = load_tran_low(40 * sizeof(*input), input);
-  int16x8_t src6 = load_tran_low(48 * sizeof(*input), input);
-  int16x8_t src7 = load_tran_low(56 * sizeof(*input), input);
+  int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3;
+  int32x4_t temp10, temp11;
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+               out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+  IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+}
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) {
+  uint8x16_t zerov = vec_splat_u8(0);
   uint8x16_t dest0 = vec_vsx_ld(0, dest);
   uint8x16_t dest1 = vec_vsx_ld(stride, dest);
   uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
@@ -313,7 +332,6 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
   uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest);
   uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest);
   uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest);
-  uint8x16_t zerov = vec_splat_u8(0);
   int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov);
   int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
   int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
@@ -325,23 +343,15 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
   int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1));
   uint16x8_t shift5 = vec_splat_u16(5);
   uint8x16_t output0, output1, output2, output3;
-  ROUND_SHIFT_INIT;
 
-  TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2,
-               tmp3, tmp4, tmp5, tmp6, tmp7);
-
-  IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
-  TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2,
-               src3, src4, src5, src6, src7);
-  IDCT8(src0, src1, src2, src3, src4, src5, src6, src7);
-  PIXEL_ADD(src0, d_u0, add, shift5);
-  PIXEL_ADD(src1, d_u1, add, shift5);
-  PIXEL_ADD(src2, d_u2, add, shift5);
-  PIXEL_ADD(src3, d_u3, add, shift5);
-  PIXEL_ADD(src4, d_u4, add, shift5);
-  PIXEL_ADD(src5, d_u5, add, shift5);
-  PIXEL_ADD(src6, d_u6, add, shift5);
-  PIXEL_ADD(src7, d_u7, add, shift5);
+  PIXEL_ADD(in[0], d_u0, add, shift5);
+  PIXEL_ADD(in[1], d_u1, add, shift5);
+  PIXEL_ADD(in[2], d_u2, add, shift5);
+  PIXEL_ADD(in[3], d_u3, add, shift5);
+  PIXEL_ADD(in[4], d_u4, add, shift5);
+  PIXEL_ADD(in[5], d_u5, add, shift5);
+  PIXEL_ADD(in[6], d_u6, add, shift5);
+  PIXEL_ADD(in[7], d_u7, add, shift5);
   output0 = vec_packsu(d_u0, d_u1);
   output1 = vec_packsu(d_u2, d_u3);
   output2 = vec_packsu(d_u4, d_u5);
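For reference, the PIXEL_ADD calls just above use add = 8 << 1 = 16 and
shift5 = 5, i.e. the standard final-pass rounding of the 8x8 inverse
transform, and vec_packsu supplies the saturation to 8 bits. A scalar
sketch of what one lane of vpx_round_store8x8_vsx computes (illustrative;
clip_pixel() is the existing helper from vpx_dsp/vpx_dsp_common.h):

    // dest = clip_pixel(dest + ROUND_POWER_OF_TWO(residual, 5)), per pixel.
    static uint8_t round_store_pixel_sketch(int16_t residual, uint8_t pred) {
      return clip_pixel(pred + ((residual + 16) >> 5));
    }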
@@ -357,6 +367,25 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
   vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest);
 }
 
+void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  int16x8_t src[8], tmp[8];
+
+  src[0] = load_tran_low(0, input);
+  src[1] = load_tran_low(8 * sizeof(*input), input);
+  src[2] = load_tran_low(16 * sizeof(*input), input);
+  src[3] = load_tran_low(24 * sizeof(*input), input);
+  src[4] = load_tran_low(32 * sizeof(*input), input);
+  src[5] = load_tran_low(40 * sizeof(*input), input);
+  src[6] = load_tran_low(48 * sizeof(*input), input);
+  src[7] = load_tran_low(56 * sizeof(*input), input);
+
+  vpx_idct8_vsx(src, tmp);
+  vpx_idct8_vsx(tmp, src);
+
+  vpx_round_store8x8_vsx(src, dest, stride);
+}
+
 #define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \
                      in6, in7, in8, in9, inA, inB, inC, inD, inE, inF)         \
   in0 = load(offset, source);                                                  \
@@ -474,9 +503,9 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest,
   tmp16_0 = vec_mergeh(outA, outD);                                          \
   tmp16_1 = vec_mergel(outA, outD);                                          \
   temp10 =                                                                   \
-      vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v));   \
+      vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v));   \
   temp11 =                                                                   \
-      vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v));   \
+      vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v));   \
   DCT_CONST_ROUND_SHIFT(temp10);                                             \
   DCT_CONST_ROUND_SHIFT(temp11);                                             \
   inA = vec_packs(temp10, temp11);                                           \
@@ -1200,3 +1229,171 @@ void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) {
   out[0] = vec_packs(u_v[0], u_v[1]);
   out[1] = vec_packs(u_v[2], u_v[3]);
 }
+
+#define MSUM_ROUND_SHIFT(a, b, cospi) \
+  b = vec_msums(a, cospi, zerov);     \
+  DCT_CONST_ROUND_SHIFT(b);
+
+#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \
+  MSUM_ROUND_SHIFT(in0, tmp0, cospi);                   \
+  MSUM_ROUND_SHIFT(in1, tmp1, cospi);                   \
+  out = vec_packs(tmp0, tmp1);
+
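The two macros above carry the ADST butterflies: inputs and coefficients are
interleaved with vec_mergeh/vec_mergel so that each 32-bit lane of
vec_msums(x, c, zerov) accumulates x_even * c_even + x_odd * c_odd, i.e. one
a * c0 + b * c1 term per lane (in the vp9_iadst8_vsx that follows, for
example, a lane of vec_msum with cospi_p02_p30_v yields
cospi_2_64 * x7 + cospi_30_64 * x0), and DCT_CONST_ROUND_SHIFT applies the
usual 14-bit rounding. A scalar sketch of one IADST_WRAPLOW lane
(illustrative; clamp() is the existing helper from
vpx_dsp/vpx_dsp_common.h):

    // One 32-bit lane: multiply-sum, dct_const_round_shift, then the
    // saturation that vec_packs applies when narrowing back to 16 bits.
    static int16_t iadst_wraplow_sketch(int16_t a, int16_t b,
                                        int16_t c0, int16_t c1) {
      const int32_t sum = a * c0 + b * c1;              // vec_msums lane
      const int32_t rounded = (sum + (1 << 13)) >> 14;  // DCT_CONST_ROUND_SHIFT
      return (int16_t)clamp(rounded, INT16_MIN, INT16_MAX);  // vec_packs
    }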
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) {
+  int32x4_t tmp0[16], tmp1[16];
+
+  int32x4_t zerov = vec_splat_s32(0);
+  int16x8_t zero16v = vec_splat_s16(0);
+  int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v);
+  int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v);
+  int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v);
+  int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v);
+  int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v);
+  int16x8_t cospi_p14_m18_v = vec_mergel(cospi14_v, cospi18m_v);
+  int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v);
+  int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v);
+  int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v);
+  int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v);
+  int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v);
+  int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v);
+  ROUND_SHIFT_INIT;
+
+  TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0],
+               out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
+
+  // stage 1
+  // interleave and multiply/add into 32-bit integer
+  in[0] = vec_mergeh(out[7], out[0]);
+  in[1] = vec_mergel(out[7], out[0]);
+  in[2] = vec_mergeh(out[5], out[2]);
+  in[3] = vec_mergel(out[5], out[2]);
+  in[4] = vec_mergeh(out[3], out[4]);
+  in[5] = vec_mergel(out[3], out[4]);
+  in[6] = vec_mergeh(out[1], out[6]);
+  in[7] = vec_mergel(out[1], out[6]);
+
+  tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov);
+  tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov);
+  tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov);
+  tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov);
+  tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov);
+  tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov);
+  tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov);
+  tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov);
+  tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov);
+  tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov);
+  tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov);
+  tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov);
+  tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov);
+  tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov);
+  tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov);
+  tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov);
+
+  tmp0[0] = vec_add(tmp1[0], tmp1[8]);
+  tmp0[1] = vec_add(tmp1[1], tmp1[9]);
+  tmp0[2] = vec_add(tmp1[2], tmp1[10]);
+  tmp0[3] = vec_add(tmp1[3], tmp1[11]);
+  tmp0[4] = vec_add(tmp1[4], tmp1[12]);
+  tmp0[5] = vec_add(tmp1[5], tmp1[13]);
+  tmp0[6] = vec_add(tmp1[6], tmp1[14]);
+  tmp0[7] = vec_add(tmp1[7], tmp1[15]);
+  tmp0[8] = vec_sub(tmp1[0], tmp1[8]);
+  tmp0[9] = vec_sub(tmp1[1], tmp1[9]);
+  tmp0[10] = vec_sub(tmp1[2], tmp1[10]);
+  tmp0[11] = vec_sub(tmp1[3], tmp1[11]);
+  tmp0[12] = vec_sub(tmp1[4], tmp1[12]);
+  tmp0[13] = vec_sub(tmp1[5], tmp1[13]);
+  tmp0[14] = vec_sub(tmp1[6], tmp1[14]);
+  tmp0[15] = vec_sub(tmp1[7], tmp1[15]);
+
+  // shift and rounding
+  DCT_CONST_ROUND_SHIFT(tmp0[0]);
+  DCT_CONST_ROUND_SHIFT(tmp0[1]);
+  DCT_CONST_ROUND_SHIFT(tmp0[2]);
+  DCT_CONST_ROUND_SHIFT(tmp0[3]);
+  DCT_CONST_ROUND_SHIFT(tmp0[4]);
+  DCT_CONST_ROUND_SHIFT(tmp0[5]);
+  DCT_CONST_ROUND_SHIFT(tmp0[6]);
+  DCT_CONST_ROUND_SHIFT(tmp0[7]);
+  DCT_CONST_ROUND_SHIFT(tmp0[8]);
+  DCT_CONST_ROUND_SHIFT(tmp0[9]);
+  DCT_CONST_ROUND_SHIFT(tmp0[10]);
+  DCT_CONST_ROUND_SHIFT(tmp0[11]);
+  DCT_CONST_ROUND_SHIFT(tmp0[12]);
+  DCT_CONST_ROUND_SHIFT(tmp0[13]);
+  DCT_CONST_ROUND_SHIFT(tmp0[14]);
+  DCT_CONST_ROUND_SHIFT(tmp0[15]);
+
+  // back to 16-bit
+  out[0] = vec_packs(tmp0[0], tmp0[1]);
+  out[1] = vec_packs(tmp0[2], tmp0[3]);
+  out[2] = vec_packs(tmp0[4], tmp0[5]);
+  out[3] = vec_packs(tmp0[6], tmp0[7]);
+  out[4] = vec_packs(tmp0[8], tmp0[9]);
+  out[5] = vec_packs(tmp0[10], tmp0[11]);
+  out[6] = vec_packs(tmp0[12], tmp0[13]);
+  out[7] = vec_packs(tmp0[14], tmp0[15]);
+
+  // stage 2
+  in[0] = vec_add(out[0], out[2]);
+  in[1] = vec_add(out[1], out[3]);
+  in[2] = vec_sub(out[0], out[2]);
+  in[3] = vec_sub(out[1], out[3]);
+  in[4] = vec_mergeh(out[4], out[5]);
+  in[5] = vec_mergel(out[4], out[5]);
+  in[6] = vec_mergeh(out[6], out[7]);
+  in[7] = vec_mergel(out[6], out[7]);
+
+  tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov);
+  tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov);
+  tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov);
+  tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov);
+  tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov);
+  tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov);
+  tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov);
+  tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov);
+
+  tmp0[0] = vec_add(tmp1[0], tmp1[4]);
+  tmp0[1] = vec_add(tmp1[1], tmp1[5]);
+  tmp0[2] = vec_add(tmp1[2], tmp1[6]);
+  tmp0[3] = vec_add(tmp1[3], tmp1[7]);
+  tmp0[4] = vec_sub(tmp1[0], tmp1[4]);
+  tmp0[5] = vec_sub(tmp1[1], tmp1[5]);
+  tmp0[6] = vec_sub(tmp1[2], tmp1[6]);
+  tmp0[7] = vec_sub(tmp1[3], tmp1[7]);
+
+  DCT_CONST_ROUND_SHIFT(tmp0[0]);
+  DCT_CONST_ROUND_SHIFT(tmp0[1]);
+  DCT_CONST_ROUND_SHIFT(tmp0[2]);
+  DCT_CONST_ROUND_SHIFT(tmp0[3]);
+  DCT_CONST_ROUND_SHIFT(tmp0[4]);
+  DCT_CONST_ROUND_SHIFT(tmp0[5]);
+  DCT_CONST_ROUND_SHIFT(tmp0[6]);
+  DCT_CONST_ROUND_SHIFT(tmp0[7]);
+
+  in[4] = vec_packs(tmp0[0], tmp0[1]);
+  in[5] = vec_packs(tmp0[2], tmp0[3]);
+  in[6] = vec_packs(tmp0[4], tmp0[5]);
+  in[7] = vec_packs(tmp0[6], tmp0[7]);
+
+  // stage 3
+  out[0] = vec_mergeh(in[2], in[3]);
+  out[1] = vec_mergel(in[2], in[3]);
+  out[2] = vec_mergeh(in[6], in[7]);
+  out[3] = vec_mergel(in[6], in[7]);
+
+  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v);
+  IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v);
+  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v);
+  IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v);
+
+  out[0] = in[0];
+  out[2] = in[6];
+  out[4] = in[3];
+  out[6] = in[5];
+
+  out[1] = vec_sub(zero16v, in[4]);
+  out[3] = vec_sub(zero16v, in[2]);
+  out[5] = vec_sub(zero16v, in[7]);
+  out[7] = vec_sub(zero16v, in[1]);
+}
diff --git a/vpx_dsp/ppc/inv_txfm_vsx.h b/vpx_dsp/ppc/inv_txfm_vsx.h
index 1b8c71d14..7df87a73e 100644
--- a/vpx_dsp/ppc/inv_txfm_vsx.h
+++ b/vpx_dsp/ppc/inv_txfm_vsx.h
@@ -4,3 +4,7 @@ void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest,
                             int stride);
 void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out);
 void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out);
+
+void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride);
+void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out);
+void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out);
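Since the parameter range in dct_test.cc now includes index 1, the new path
is exercised against the C reference for all four tx_types. A stand-alone
sketch of that parity check (hypothetical helper, not the actual test code;
vp9_iht8x8_64_add_c is the rtcd-generated C reference entry point):

    #include <string.h>

    // Runs the same coefficients through the C reference and the VSX
    // version, starting from identical (zeroed) prediction blocks.
    static int iht8x8_matches_reference(const tran_low_t coeff[64],
                                        int tx_type) {
      uint8_t ref[64] = { 0 };
      uint8_t opt[64] = { 0 };
      vp9_iht8x8_64_add_c(coeff, ref, 8, tx_type);
      vp9_iht8x8_64_add_vsx(coeff, opt, 8, tx_type);
      return memcmp(ref, opt, sizeof(ref)) == 0;
    }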