array_transpose_8x8(in, in);
}
+#if CONFIG_EXT_TX
+static void fidtx8_sse2(__m128i *in) {
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+
+ array_transpose_8x8(in, in);
+}
+#endif // CONFIG_EXT_TX
+
void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[8];
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
+ case V_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fdct8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
#endif // CONFIG_EXT_TX
default:
assert(0);
- break;
}
}
array_transpose_16x16(in0, in1);
}
+#if CONFIG_EXT_TX
+static void fidtx16_8col(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+ in[8] = _mm_slli_epi16(in[8], 1);
+ in[9] = _mm_slli_epi16(in[9], 1);
+ in[10] = _mm_slli_epi16(in[10], 1);
+ in[11] = _mm_slli_epi16(in[11], 1);
+ in[12] = _mm_slli_epi16(in[12], 1);
+ in[13] = _mm_slli_epi16(in[13], 1);
+ in[14] = _mm_slli_epi16(in[14], 1);
+ in[15] = _mm_slli_epi16(in[15], 1);
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+ v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+ v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+ v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+ v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+ u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+ u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+ u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+ u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+ u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+ u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+ u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+ u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+ x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+ x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+ x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+ x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+ x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+ x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+ x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+ x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+ y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+ y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+ y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+ y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+ y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+ y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+ y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+ y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
+ v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+ v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+ v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+ v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+ v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+ x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+ x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+ x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+ x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+ x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+ x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+ x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+ x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+ u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+ u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+ u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+ u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+ u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+ y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+ y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+ y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+ y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+ y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+ y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+ y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+ y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+ x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+ x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+ x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+ x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+ x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+ x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+ x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+ y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+ y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+ y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+ y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+ y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+ y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+ y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+ x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+ x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+ x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+ x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+ x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+ x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+ u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+ y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+ y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+ y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+ y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+ y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+ y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(v0, x0);
+ in[1] = _mm_packs_epi32(v1, x1);
+ in[2] = _mm_packs_epi32(v2, x2);
+ in[3] = _mm_packs_epi32(v3, x3);
+ in[4] = _mm_packs_epi32(v4, x4);
+ in[5] = _mm_packs_epi32(v5, x5);
+ in[6] = _mm_packs_epi32(v6, x6);
+ in[7] = _mm_packs_epi32(v7, x7);
+
+ in[8] = _mm_packs_epi32(u0, y0);
+ in[9] = _mm_packs_epi32(u1, y1);
+ in[10] = _mm_packs_epi32(u2, y2);
+ in[11] = _mm_packs_epi32(u3, y3);
+ in[12] = _mm_packs_epi32(u4, y4);
+ in[13] = _mm_packs_epi32(u5, y5);
+ in[14] = _mm_packs_epi32(u6, y6);
+ in[15] = _mm_packs_epi32(u7, y7);
+}
+
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
+ fidtx16_8col(in0);
+ fidtx16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
+ case V_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
#endif // CONFIG_EXT_TX
default:
assert(0);