#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
- const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
- const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
- return _mm_packs_epi32(t0, t1);
-}
-
-static INLINE void highbd_write_buffer_8x1(uint16_t *dest, const __m128i in,
- const int bd) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- __m128i out;
-
- out = _mm_adds_epi16(in, final_rounding);
- out = _mm_srai_epi16(out, 6);
- recon_and_store_8_kernel(out, &dest, 0, bd);
-}
-
-static INLINE void recon_and_store_4_kernel(const __m128i in,
- uint16_t *const dest,
- const int bd) {
- __m128i d;
-
- d = _mm_loadl_epi64((const __m128i *)dest);
- d = add_clamp(d, in, bd);
- _mm_storel_epi64((__m128i *)dest, d);
-}
-
-static INLINE void highbd_write_buffer_4x1(uint16_t *const dest,
- const __m128i in, const int bd) {
- const __m128i final_rounding = _mm_set1_epi32(1 << 5);
- __m128i out;
-
- out = _mm_add_epi32(in, final_rounding);
- out = _mm_srai_epi32(out, 6);
- out = _mm_packs_epi32(out, out);
- recon_and_store_4_kernel(out, dest, bd);
-}
-
static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
__m128i *const out) {
__m128i temp1[2], temp2, sign[2];
out[15] = in[15];
}
-static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
- __m128i *const out) {
- out[0] = _mm_add_epi32(in[0], in[15]);
- out[1] = _mm_add_epi32(in[1], in[14]);
- out[2] = _mm_add_epi32(in[2], in[13]);
- out[3] = _mm_add_epi32(in[3], in[12]);
- out[4] = _mm_add_epi32(in[4], in[11]);
- out[5] = _mm_add_epi32(in[5], in[10]);
- out[6] = _mm_add_epi32(in[6], in[9]);
- out[7] = _mm_add_epi32(in[7], in[8]);
- out[8] = _mm_sub_epi32(in[7], in[8]);
- out[9] = _mm_sub_epi32(in[6], in[9]);
- out[10] = _mm_sub_epi32(in[5], in[10]);
- out[11] = _mm_sub_epi32(in[4], in[11]);
- out[12] = _mm_sub_epi32(in[3], in[12]);
- out[13] = _mm_sub_epi32(in[2], in[13]);
- out[14] = _mm_sub_epi32(in[1], in[14]);
- out[15] = _mm_sub_epi32(in[0], in[15]);
-}
-
static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
__m128i step1[16], step2[16];
__m128i temp1[4], temp2, sign[2];
input += 128;
}
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(l + i * 8, out);
- transpose_16bit_8x8(r + i * 8, out + 8);
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
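
The loop rewrites in this patch are pure re-indexing: `i` now walks the
column offset itself (0, 8 for the 8-wide passes; 0, 4, 8, 12 for the
4-wide passes), so the `i * 8` / `4 * i` scaling inside the bodies goes
away. A throwaway check of the equivalence (hypothetical, offsets only):

    #include <stdio.h>

    int main(void) {
      int i;
      for (i = 0; i < 2; i++) printf("%d ", i * 8);  /* prints: 0 8 */
      for (i = 0; i < 16; i += 8) printf("%d ", i);  /* prints: 0 8 */
      return 0;
    }
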
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
- out[8] = all[2][4 * i + 0];
- out[9] = all[3][4 * i + 0];
- out[10] = all[2][4 * i + 1];
- out[11] = all[3][4 * i + 1];
- out[12] = all[2][4 * i + 2];
- out[13] = all[3][4 * i + 2];
- out[14] = all[2][4 * i + 3];
- out[15] = all[3][4 * i + 3];
+ out[8] = all[2][i + 0];
+ out[9] = all[3][i + 0];
+ out[10] = all[2][i + 1];
+ out[11] = all[3][i + 1];
+ out[12] = all[2][i + 2];
+ out[13] = all[3][i + 2];
+ out[14] = all[2][i + 3];
+ out[15] = all[3][i + 3];
transpose_32bit_8x4(out + 8, out + 8);
highbd_idct16_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
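
The gathers above pull matching vectors from the row groups all[0]/all[1]
(and all[2]/all[3]) and hand them to transpose_32bit_8x4(). For intuition,
a self-contained 4x4 sketch of a 32-bit-lane transpose; the real helpers
live in transpose_sse2.h, this only mirrors the usual unpack pattern:

    #include <emmintrin.h>

    /* Transpose a 4x4 block of 32-bit lanes; element 0 is the lowest lane. */
    static void transpose_32bit_4x4_sketch(__m128i *const a) {
      const __m128i t0 = _mm_unpacklo_epi32(a[0], a[1]); /* 00 10 01 11 */
      const __m128i t1 = _mm_unpackhi_epi32(a[0], a[1]); /* 02 12 03 13 */
      const __m128i t2 = _mm_unpacklo_epi32(a[2], a[3]); /* 20 30 21 31 */
      const __m128i t3 = _mm_unpackhi_epi32(a[2], a[3]); /* 22 32 23 33 */
      a[0] = _mm_unpacklo_epi64(t0, t2); /* 00 10 20 30 */
      a[1] = _mm_unpackhi_epi64(t0, t2); /* 01 11 21 31 */
      a[2] = _mm_unpacklo_epi64(t1, t3); /* 02 12 22 32 */
      a[3] = _mm_unpackhi_epi64(t1, t3); /* 03 13 23 33 */
    }
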
in[15] = _mm_setzero_si128();
idct16_8col(in);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- transpose_16bit_8x8(in + i * 8, out);
+ transpose_16bit_8x8(in + i, out);
out[8] = _mm_setzero_si128();
out[9] = _mm_setzero_si128();
out[10] = _mm_setzero_si128();
idct16_8col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
}
dest += 8;
}
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- out[0] = all[0][4 * i + 0];
- out[1] = all[1][4 * i + 0];
- out[2] = all[0][4 * i + 1];
- out[3] = all[1][4 * i + 1];
- out[4] = all[0][4 * i + 2];
- out[5] = all[1][4 * i + 2];
- out[6] = all[0][4 * i + 3];
- out[7] = all[1][4 * i + 3];
+ out[0] = all[0][i + 0];
+ out[1] = all[1][i + 0];
+ out[2] = all[0][i + 1];
+ out[3] = all[1][i + 1];
+ out[4] = all[0][i + 2];
+ out[5] = all[1][i + 2];
+ out[6] = all[0][i + 3];
+ out[7] = all[1][i + 3];
transpose_32bit_8x4(out, out);
highbd_idct16x16_38_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
idct16x16_10_pass1(in, l);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < 16; i += 8) {
int j;
- idct16x16_10_pass2(l + 8 * i, in);
+ idct16x16_10_pass2(l + i, in);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_8x1(dest + j * stride, in[j], bd);
+ highbd_write_buffer_8(dest + j * stride, in[j], bd);
}
dest += 8;
}
input += 4 * 16;
}
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 16; i += 4) {
int j;
- transpose_32bit_4x4(&all[0][4 * i], out);
+ transpose_32bit_4x4(&all[0][i], out);
highbd_idct16x16_10_4col(out);
for (j = 0; j < 16; ++j) {
- highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
}
dest += 4;
}
io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
}
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+ __m128i *const out) {
+ out[0] = _mm_add_epi32(in[0], in[15]);
+ out[1] = _mm_add_epi32(in[1], in[14]);
+ out[2] = _mm_add_epi32(in[2], in[13]);
+ out[3] = _mm_add_epi32(in[3], in[12]);
+ out[4] = _mm_add_epi32(in[4], in[11]);
+ out[5] = _mm_add_epi32(in[5], in[10]);
+ out[6] = _mm_add_epi32(in[6], in[9]);
+ out[7] = _mm_add_epi32(in[7], in[8]);
+ out[8] = _mm_sub_epi32(in[7], in[8]);
+ out[9] = _mm_sub_epi32(in[6], in[9]);
+ out[10] = _mm_sub_epi32(in[5], in[10]);
+ out[11] = _mm_sub_epi32(in[4], in[11]);
+ out[12] = _mm_sub_epi32(in[3], in[12]);
+ out[13] = _mm_sub_epi32(in[2], in[13]);
+ out[14] = _mm_sub_epi32(in[1], in[14]);
+ out[15] = _mm_sub_epi32(in[0], in[15]);
+}
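
The stage-7 butterfly above (moved, not changed) is the usual mirror
add/sub; per 32-bit lane it computes the following (scalar sketch for
intuition, not a replacement):

    #include <stdint.h>

    /* Scalar view of highbd_idct16_4col_stage7, one lane at a time. */
    static void idct16_stage7_scalar(const int32_t *const in,
                                     int32_t *const out) {
      int k;
      for (k = 0; k < 8; ++k) {
        out[k] = in[k] + in[15 - k];
        out[15 - k] = in[k] - in[15 - k];
      }
    }
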
+
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
const __m128i zero = _mm_set1_epi16(0);
}
}
-static INLINE void recon_and_store_4_dual(const __m128i in,
- uint16_t *const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+ const int bd) {
+ __m128i d;
+
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_clamp(d, in, bd);
+ _mm_storel_epi64((__m128i *)dest, d);
+}
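
recon_and_store_4() (renamed from recon_and_store_4_kernel) adds a row of
four 16-bit residuals to the destination pixels through add_clamp().
Scalar equivalent, assuming add_clamp() clamps to [0, (1 << bd) - 1]:

    #include <stdint.h>

    /* Hypothetical scalar counterpart of recon_and_store_4. */
    static void recon_and_store_4_scalar(const int16_t *const in,
                                         uint16_t *const dest, const int bd) {
      const int max = (1 << bd) - 1;
      int k;
      for (k = 0; k < 4; ++k) {
        const int v = dest[k] + in[k];  /* reconstruct */
        dest[k] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);
      }
    }
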
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+ const int stride, const int bd) {
__m128i d;
d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
_mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
}
-static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_4_dual(in[0], dest, stride, bd);
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_4x2(in[0], dest, stride, bd);
dest += 2 * stride;
- recon_and_store_4_dual(in[1], dest, stride, bd);
+ recon_and_store_4x2(in[1], dest, stride, bd);
}
-static INLINE void recon_and_store_8_kernel(const __m128i in,
- uint16_t **const dest,
- const int stride, const int bd) {
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+ const int stride, const int bd) {
__m128i d;
d = _mm_load_si128((const __m128i *)(*dest));
*dest += stride;
}
-static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
- const int stride, const int bd) {
- recon_and_store_8_kernel(in[0], &dest, stride, bd);
- recon_and_store_8_kernel(in[1], &dest, stride, bd);
- recon_and_store_8_kernel(in[2], &dest, stride, bd);
- recon_and_store_8_kernel(in[3], &dest, stride, bd);
- recon_and_store_8_kernel(in[4], &dest, stride, bd);
- recon_and_store_8_kernel(in[5], &dest, stride, bd);
- recon_and_store_8_kernel(in[6], &dest, stride, bd);
- recon_and_store_8_kernel(in[7], &dest, stride, bd);
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+ const int stride, const int bd) {
+ recon_and_store_8(in[0], &dest, stride, bd);
+ recon_and_store_8(in[1], &dest, stride, bd);
+ recon_and_store_8(in[2], &dest, stride, bd);
+ recon_and_store_8(in[3], &dest, stride, bd);
+ recon_and_store_8(in[4], &dest, stride, bd);
+ recon_and_store_8(in[5], &dest, stride, bd);
+ recon_and_store_8(in[6], &dest, stride, bd);
+ recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+ const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+ const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+ return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i out;
+
+ out = _mm_adds_epi16(in, final_rounding);
+ out = _mm_srai_epi16(out, 6);
+ recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+ const int bd) {
+ const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+ __m128i out;
+
+ out = _mm_add_epi32(in, final_rounding);
+ out = _mm_srai_epi32(out, 6);
+ out = _mm_packs_epi32(out, out);
+ recon_and_store_4(out, dest, bd);
}
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
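
One detail worth noting on the write-back path: recon_and_store_8() takes
uint16_t **dest and advances *dest by stride after each row, which is why
highbd_write_buffer_8() can pass a stride of 0 to emit a single row. The
write-buffer helpers implement the standard final rounding: the idct16
output is in Q6 (scaled by 64), so they add 1 << 5 and shift right by 6
to round to nearest before reconstructing. Scalar sketch of one row
(hypothetical helper, same clamp assumption as above):

    #include <stdint.h>

    static void highbd_write_row_scalar(uint16_t *const dest,
                                        const int32_t *const in, const int n,
                                        const int bd) {
      const int max = (1 << bd) - 1;
      int k;
      for (k = 0; k < n; ++k) {
        const int res = (in[k] + 32) >> 6;  /* round Q6 -> integer */
        const int v = dest[k] + res;        /* reconstruct */
        dest[k] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);
      }
    }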