* be found in the AUTHORS file in the root of the source tree.
*/
+#include <string.h>
#include "test/acm_random.h"
#include "test/register_state_check.h"
#include "test/util.h"
protected:
static const int kDataAlignment = 16;
- static const int kOuterBlockSize = 128;
+ static const int kOuterBlockSize = 256;
static const int kInputStride = kOuterBlockSize;
static const int kOutputStride = kOuterBlockSize;
static const int kMaxDimension = 64;
input_[i] = prng.Rand8Extremes();
}
+ void SetConstantInput(int value) {
+ memset(input_, value, kInputBufferSize);
+ }
+
void CheckGuardBlocks() {
for (int i = 0; i < kOutputBufferSize; ++i) {
if (IsIndexInBorder(i))
}
}
+/* This test verifies that enough rows and columns are filtered for every
+   possible initial fractional position and scaling step. */
+TEST_P(ConvolveTest, CheckScalingFiltering) {
+ uint8_t* const in = input();
+ uint8_t* const out = output();
+
+ SetConstantInput(127);
+
+ for (int frac = 0; frac < 16; ++frac) {
+ for (int step = 1; step <= 32; ++step) {
+ /* Test the horizontal and vertical filters in combination. */
+ REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
+ vp9_sub_pel_filters_8[frac], step,
+ vp9_sub_pel_filters_8[frac], step,
+ Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ ASSERT_EQ(in[y * kInputStride + x], out[y * kOutputStride + x])
+ << "x == " << x << ", y == " << y
+ << ", frac == " << frac << ", step == " << step;
+ }
+ }
+ }
+ }
+}
+
using std::tr1::make_tuple;
const ConvolveFunctions convolve8_c(
* h == 64, taps == 8.
*/
uint8_t temp[64 * 135];
- int intermediate_height = MAX(((h * y_step_q4) >> 4), 1) + taps - 1;
+ int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;
assert(w <= 64);
assert(h <= 64);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- if (intermediate_height < h)
- intermediate_height = h;
-
convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
filter_x, x_step_q4, filter_y, y_step_q4, w,
intermediate_height, taps);
specialize vp9_quantize_b $ssse3_x86_64
prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
-specialize vp9_quantize_b_32x32 $ssse3_x86_64
+specialize vp9_quantize_b_32x32 # $ssse3_x86_64 FIXME(jingning): need a unit test on this before enabling
#
# Structured Similarity (SSIM)
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]);
- DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]);
-
- DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]);
-
- DECLARE_ALIGNED(16, unsigned char, ap[8][8]);
- DECLARE_ALIGNED(16, unsigned char, aq[8][8]);
-
-
__m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
- __m128i p7, p6, p5;
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
- __m128i q5, q6, q7;
- int i = 0;
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
const unsigned int extended_limit = _limit[0] * 0x01010101u;
const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
const __m128i blimit =
_mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
- p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- q4 = _mm_loadl_epi64((__m128i *)(s + 4 * p));
-
- _mm_storel_epi64((__m128i *)ap[4], p4);
- _mm_storel_epi64((__m128i *)ap[3], p3);
- _mm_storel_epi64((__m128i *)ap[2], p2);
- _mm_storel_epi64((__m128i *)ap[1], p1);
- _mm_storel_epi64((__m128i *)ap[0], p0);
- _mm_storel_epi64((__m128i *)aq[4], q4);
- _mm_storel_epi64((__m128i *)aq[3], q3);
- _mm_storel_epi64((__m128i *)aq[2], q2);
- _mm_storel_epi64((__m128i *)aq[1], q1);
- _mm_storel_epi64((__m128i *)aq[0], q0);
-
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
+ (__m64 *)(s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
+ (__m64 *)(s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
+ (__m64 *)(s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
+ (__m64 *)(s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
+ (__m64 *)(s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
{
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
- _mm_subs_epu8(q0, p0));
- __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
- _mm_subs_epu8(q1, p1));
- __m128i work;
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(flat, mask);
+ mask = _mm_max_epu8(abs_p1p0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
- _mm_subs_epu8(p1, p2)),
- _mm_or_si128(_mm_subs_epu8(p3, p2),
- _mm_subs_epu8(p2, p3)));
- mask = _mm_max_epu8(work, mask);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
- _mm_subs_epu8(q1, q2)),
- _mm_or_si128(_mm_subs_epu8(q3, q2),
- _mm_subs_epu8(q2, q3)));
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
}
const __m128i t4 = _mm_set1_epi8(4);
const __m128i t3 = _mm_set1_epi8(3);
const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i te0 = _mm_set1_epi8(0xe0);
- const __m128i t1f = _mm_set1_epi8(0x1f);
- const __m128i t1 = _mm_set1_epi8(0x1);
- const __m128i t7f = _mm_set1_epi8(0x7f);
-
- __m128i ps1 = _mm_xor_si128(p1, t80);
- __m128i ps0 = _mm_xor_si128(p0, t80);
- __m128i qs0 = _mm_xor_si128(q0, t80);
- __m128i qs1 = _mm_xor_si128(q1, t80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
- filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
- work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter1);
- filter1 = _mm_srli_epi16(filter1, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter1 = _mm_and_si128(filter1, t1f);
- filter1 = _mm_or_si128(filter1, work_a);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
- /* Filter2 >> 3 */
- work_a = _mm_cmpgt_epi8(zero, filter2);
- filter2 = _mm_srli_epi16(filter2, 3);
- work_a = _mm_and_si128(work_a, te0);
- filter2 = _mm_and_si128(filter2, t1f);
- filter2 = _mm_or_si128(filter2, work_a);
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
/* filt >> 1 */
- filt = _mm_adds_epi8(filter1, t1);
- work_a = _mm_cmpgt_epi8(zero, filt);
- filt = _mm_srli_epi16(filt, 1);
- work_a = _mm_and_si128(work_a, t80);
- filt = _mm_and_si128(filt, t7f);
- filt = _mm_or_si128(filt, work_a);
- filt = _mm_andnot_si128(hev, filt);
- ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
+ filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
// loopfilter done
{
__m128i work;
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
+ flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
- q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p));
- flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
- _mm_subs_epu8(p0, p5)),
- _mm_or_si128(_mm_subs_epu8(q5, q0),
- _mm_subs_epu8(q0, q5)));
- _mm_storel_epi64((__m128i *)ap[5], p5);
- _mm_storel_epi64((__m128i *)aq[5], q5);
- flat2 = _mm_max_epu8(work, flat2);
- p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
- q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p));
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
- _mm_subs_epu8(p0, p6)),
- _mm_or_si128(_mm_subs_epu8(q6, q0),
- _mm_subs_epu8(q0, q6)));
- _mm_storel_epi64((__m128i *)ap[6], p6);
- _mm_storel_epi64((__m128i *)aq[6], q6);
- flat2 = _mm_max_epu8(work, flat2);
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *)(s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *)(s + 6 * p)));
+
+ flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *)(s + 7 * p)));
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
- p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
- q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p));
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
- _mm_subs_epu8(p0, p7)),
- _mm_or_si128(_mm_subs_epu8(q7, q0),
- _mm_subs_epu8(q0, q7)));
- _mm_storel_epi64((__m128i *)ap[7], p7);
- _mm_storel_epi64((__m128i *)aq[7], q7);
flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
{
const __m128i eight = _mm_set1_epi16(8);
const __m128i four = _mm_set1_epi16(4);
- {
- __m128i workp_shft;
- __m128i a, b, c;
-
- p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7])), zero);
- p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero);
- p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero);
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero);
- q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero);
- q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero);
- q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero);
-
- c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
- c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
-
- b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
- a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
- a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
-
- _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
-
- c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q1, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
- _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
-
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q2, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
- _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
-
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q3, a);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
- _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
-
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- b = _mm_add_epi16(q3, b);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
- _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
-
- c = _mm_add_epi16(q4, c);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- b = _mm_add_epi16(q3, b);
- b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
- _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
- _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
- , b));
- a = _mm_add_epi16(q5, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q6, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- a = _mm_add_epi16(q7, a);
- c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
- }
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
+ pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(p7_16, p0_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(q7_16, q0_16)), 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p1_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q1_16)), 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p2_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q2_16)), 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p3_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q3_16)), 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p4_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q4_16)), 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p5_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q5_16)), 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(sum_p7, p6_16)), 4);
+ res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
+ _mm_add_epi16(sum_q7, q6_16)), 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
}
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- work_a = _mm_loadl_epi64((__m128i *)ap[2]);
- p2 = _mm_loadl_epi64((__m128i *)flat_op[2]);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)flat_op[2], p2);
-
- p1 = _mm_loadl_epi64((__m128i *)flat_op[1]);
- work_a = _mm_andnot_si128(flat, ps1);
- p1 = _mm_and_si128(flat, p1);
- p1 = _mm_or_si128(work_a, p1);
- _mm_storel_epi64((__m128i *)flat_op[1], p1);
-
- p0 = _mm_loadl_epi64((__m128i *)flat_op[0]);
- work_a = _mm_andnot_si128(flat, ps0);
- p0 = _mm_and_si128(flat, p0);
- p0 = _mm_or_si128(work_a, p0);
- _mm_storel_epi64((__m128i *)flat_op[0], p0);
-
- q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]);
- work_a = _mm_andnot_si128(flat, qs0);
- q0 = _mm_and_si128(flat, q0);
- q0 = _mm_or_si128(work_a, q0);
- _mm_storel_epi64((__m128i *)flat_oq[0], q0);
-
- q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]);
- work_a = _mm_andnot_si128(flat, qs1);
- q1 = _mm_and_si128(flat, q1);
- q1 = _mm_or_si128(work_a, q1);
- _mm_storel_epi64((__m128i *)flat_oq[1], q1);
-
- work_a = _mm_loadl_epi64((__m128i *)aq[2]);
- q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- q2 = _mm_or_si128(work_a, q2);
- _mm_storel_epi64((__m128i *)flat_oq[2], q2);
-
- // write out op6 - op3
- {
- unsigned char *dst = (s - 7 * p);
- for (i = 6; i > 2; i--) {
- __m128i flat2_output;
- work_a = _mm_loadl_epi64((__m128i *)ap[i]);
- flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]);
- work_a = _mm_andnot_si128(flat2, work_a);
- flat2_output = _mm_and_si128(flat2, flat2_output);
- work_a = _mm_or_si128(work_a, flat2_output);
- _mm_storel_epi64((__m128i *)dst, work_a);
- dst += p;
- }
- }
-
- work_a = _mm_loadl_epi64((__m128i *)flat_op[2]);
- p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p2 = _mm_and_si128(flat2, p2);
- p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_op[1]);
- p1 = _mm_loadl_epi64((__m128i *)flat2_op[1]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p1 = _mm_and_si128(flat2, p1);
- p1 = _mm_or_si128(work_a, p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_op[0]);
- p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]);
- work_a = _mm_andnot_si128(flat2, work_a);
- p0 = _mm_and_si128(flat2, p0);
- p0 = _mm_or_si128(work_a, p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]);
- q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q0 = _mm_and_si128(flat2, q0);
- q0 = _mm_or_si128(work_a, q0);
- _mm_storel_epi64((__m128i *)(s - 0 * p), q0);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]);
- q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q1 = _mm_and_si128(flat2, q1);
- q1 = _mm_or_si128(work_a, q1);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
-
- work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]);
- q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]);
- work_a = _mm_andnot_si128(flat2, work_a);
- q2 = _mm_and_si128(flat2, q2);
- q2 = _mm_or_si128(work_a, q2);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-
- // write out oq3 - oq7
- {
- unsigned char *dst = (s + 3 * p);
- for (i = 3; i < 7; i++) {
- __m128i flat2_output;
- work_a = _mm_loadl_epi64((__m128i *)aq[i]);
- flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]);
- work_a = _mm_andnot_si128(flat2, work_a);
- flat2_output = _mm_and_si128(flat2, flat2_output);
- work_a = _mm_or_si128(work_a, flat2_output);
- _mm_storel_epi64((__m128i *)dst, work_a);
- dst += p;
- }
- }
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
}
}
output[30] = step[30];
output[31] = step[31];
+  // Scale the magnitude down by a factor of 4 so that the intermediate
+  // values stay within the range of 16 bits.
+ if (round) {
+ output[0] = half_round_shift(output[0]);
+ output[1] = half_round_shift(output[1]);
+ output[2] = half_round_shift(output[2]);
+ output[3] = half_round_shift(output[3]);
+ output[4] = half_round_shift(output[4]);
+ output[5] = half_round_shift(output[5]);
+ output[6] = half_round_shift(output[6]);
+ output[7] = half_round_shift(output[7]);
+ output[8] = half_round_shift(output[8]);
+ output[9] = half_round_shift(output[9]);
+ output[10] = half_round_shift(output[10]);
+ output[11] = half_round_shift(output[11]);
+ output[12] = half_round_shift(output[12]);
+ output[13] = half_round_shift(output[13]);
+ output[14] = half_round_shift(output[14]);
+ output[15] = half_round_shift(output[15]);
+
+ output[16] = half_round_shift(output[16]);
+ output[17] = half_round_shift(output[17]);
+ output[18] = half_round_shift(output[18]);
+ output[19] = half_round_shift(output[19]);
+ output[20] = half_round_shift(output[20]);
+ output[21] = half_round_shift(output[21]);
+ output[22] = half_round_shift(output[22]);
+ output[23] = half_round_shift(output[23]);
+ output[24] = half_round_shift(output[24]);
+ output[25] = half_round_shift(output[25]);
+ output[26] = half_round_shift(output[26]);
+ output[27] = half_round_shift(output[27]);
+ output[28] = half_round_shift(output[28]);
+ output[29] = half_round_shift(output[29]);
+ output[30] = half_round_shift(output[30]);
+ output[31] = half_round_shift(output[31]);
+ }
+
// Stage 3
step[0] = output[0] + output[(8 - 1)];
step[1] = output[1] + output[(8 - 2)];
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- step[0] = half_round_shift(step[0]);
- step[1] = half_round_shift(step[1]);
- step[2] = half_round_shift(step[2]);
- step[3] = half_round_shift(step[3]);
- step[4] = half_round_shift(step[4]);
- step[5] = half_round_shift(step[5]);
- step[6] = half_round_shift(step[6]);
- step[7] = half_round_shift(step[7]);
- step[8] = half_round_shift(step[8]);
- step[9] = half_round_shift(step[9]);
- step[10] = half_round_shift(step[10]);
- step[11] = half_round_shift(step[11]);
- step[12] = half_round_shift(step[12]);
- step[13] = half_round_shift(step[13]);
- step[14] = half_round_shift(step[14]);
- step[15] = half_round_shift(step[15]);
-
- step[16] = half_round_shift(step[16]);
- step[17] = half_round_shift(step[17]);
- step[18] = half_round_shift(step[18]);
- step[19] = half_round_shift(step[19]);
- step[20] = half_round_shift(step[20]);
- step[21] = half_round_shift(step[21]);
- step[22] = half_round_shift(step[22]);
- step[23] = half_round_shift(step[23]);
- step[24] = half_round_shift(step[24]);
- step[25] = half_round_shift(step[25]);
- step[26] = half_round_shift(step[26]);
- step[27] = half_round_shift(step[27]);
- step[28] = half_round_shift(step[28]);
- step[29] = half_round_shift(step[29]);
- step[30] = half_round_shift(step[30]);
- step[31] = half_round_shift(step[31]);
- }
-
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
(sse_v - var_v < thresh_dc || sse_v == var_v)) {
x->skip = 1;
- *rate2 = 500;
- *rate_uv = 0;
+ // The cost of skip bit needs to be added.
+ *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
// Scaling factor for SSE from spatial domain to frequency domain
// is 16. Adjust distortion accordingly.
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
}
+
+#if !FDCT32x32_HIGH_PRECISION
+ // damp the magnitude by half, hence the intermediate values are within
+ // the range of 16 bits.
+ if (1 == pass) {
+ __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
+ __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
+ __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
+ __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
+ __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
+ __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
+ __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
+ __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
+ __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
+ __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
+ __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
+ __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
+ __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
+ __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
+ __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
+ __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
+ __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
+ __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
+ __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
+ __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
+ __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
+ __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
+ __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
+ __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
+ __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
+ __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
+ __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
+ __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
+ __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
+ __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
+ __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
+ __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
+
+ step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
+ step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
+ step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
+ step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
+ step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
+ step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
+ step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
+ step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
+ step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
+ step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
+
+ step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm_add_epi16(step2[10], kOne);
+ step2[11] = _mm_add_epi16(step2[11], kOne);
+ step2[12] = _mm_add_epi16(step2[12], kOne);
+ step2[13] = _mm_add_epi16(step2[13], kOne);
+ step2[14] = _mm_add_epi16(step2[14], kOne);
+ step2[15] = _mm_add_epi16(step2[15], kOne);
+ step1[16] = _mm_add_epi16(step1[16], kOne);
+ step1[17] = _mm_add_epi16(step1[17], kOne);
+ step1[18] = _mm_add_epi16(step1[18], kOne);
+ step1[19] = _mm_add_epi16(step1[19], kOne);
+ step2[20] = _mm_add_epi16(step2[20], kOne);
+ step2[21] = _mm_add_epi16(step2[21], kOne);
+ step2[22] = _mm_add_epi16(step2[22], kOne);
+ step2[23] = _mm_add_epi16(step2[23], kOne);
+ step2[24] = _mm_add_epi16(step2[24], kOne);
+ step2[25] = _mm_add_epi16(step2[25], kOne);
+ step2[26] = _mm_add_epi16(step2[26], kOne);
+ step2[27] = _mm_add_epi16(step2[27], kOne);
+ step1[28] = _mm_add_epi16(step1[28], kOne);
+ step1[29] = _mm_add_epi16(step1[29], kOne);
+ step1[30] = _mm_add_epi16(step1[30], kOne);
+ step1[31] = _mm_add_epi16(step1[31], kOne);
+
+ step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm_srai_epi16(step2[10], 2);
+ step2[11] = _mm_srai_epi16(step2[11], 2);
+ step2[12] = _mm_srai_epi16(step2[12], 2);
+ step2[13] = _mm_srai_epi16(step2[13], 2);
+ step2[14] = _mm_srai_epi16(step2[14], 2);
+ step2[15] = _mm_srai_epi16(step2[15], 2);
+ step1[16] = _mm_srai_epi16(step1[16], 2);
+ step1[17] = _mm_srai_epi16(step1[17], 2);
+ step1[18] = _mm_srai_epi16(step1[18], 2);
+ step1[19] = _mm_srai_epi16(step1[19], 2);
+ step2[20] = _mm_srai_epi16(step2[20], 2);
+ step2[21] = _mm_srai_epi16(step2[21], 2);
+ step2[22] = _mm_srai_epi16(step2[22], 2);
+ step2[23] = _mm_srai_epi16(step2[23], 2);
+ step2[24] = _mm_srai_epi16(step2[24], 2);
+ step2[25] = _mm_srai_epi16(step2[25], 2);
+ step2[26] = _mm_srai_epi16(step2[26], 2);
+ step2[27] = _mm_srai_epi16(step2[27], 2);
+ step1[28] = _mm_srai_epi16(step1[28], 2);
+ step1[29] = _mm_srai_epi16(step1[29], 2);
+ step1[30] = _mm_srai_epi16(step1[30], 2);
+ step1[31] = _mm_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
// Stage 3
{
step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
step3[31] = _mm_add_epi16(step2[24], step1[31]);
}
-#if !FDCT32x32_HIGH_PRECISION
- // dump the magnitude by half, hence the intermediate values are within
- // the range of 16 bits.
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step3[ 0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step3[ 1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step3[ 2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step3[ 3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step3[ 4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step3[ 5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step3[ 6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step3[ 7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step3[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step3[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step3[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step3[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step3[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step3[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step3[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step3[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step3[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step3[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step3[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step3[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step3[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step3[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step3[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step3[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step3[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step3[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step3[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step3[31], kZero);
- step3[ 0] = _mm_sub_epi16(step3[ 0], s3_00_0);
- step3[ 1] = _mm_sub_epi16(step3[ 1], s3_01_0);
- step3[ 2] = _mm_sub_epi16(step3[ 2], s3_02_0);
- step3[ 3] = _mm_sub_epi16(step3[ 3], s3_03_0);
- step3[ 4] = _mm_sub_epi16(step3[ 4], s3_04_0);
- step3[ 5] = _mm_sub_epi16(step3[ 5], s3_05_0);
- step3[ 6] = _mm_sub_epi16(step3[ 6], s3_06_0);
- step3[ 7] = _mm_sub_epi16(step3[ 7], s3_07_0);
- step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
- step3[10] = _mm_sub_epi16(step3[10], s3_10_0);
- step3[11] = _mm_sub_epi16(step3[11], s3_11_0);
- step3[12] = _mm_sub_epi16(step3[12], s3_12_0);
- step3[13] = _mm_sub_epi16(step3[13], s3_13_0);
- step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
- step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
- step3[16] = _mm_sub_epi16(step3[16], s3_16_0);
- step3[17] = _mm_sub_epi16(step3[17], s3_17_0);
- step3[18] = _mm_sub_epi16(step3[18], s3_18_0);
- step3[19] = _mm_sub_epi16(step3[19], s3_19_0);
- step3[20] = _mm_sub_epi16(step3[20], s3_20_0);
- step3[21] = _mm_sub_epi16(step3[21], s3_21_0);
- step3[22] = _mm_sub_epi16(step3[22], s3_22_0);
- step3[23] = _mm_sub_epi16(step3[23], s3_23_0);
- step3[24] = _mm_sub_epi16(step3[24], s3_24_0);
- step3[25] = _mm_sub_epi16(step3[25], s3_25_0);
- step3[26] = _mm_sub_epi16(step3[26], s3_26_0);
- step3[27] = _mm_sub_epi16(step3[27], s3_27_0);
- step3[28] = _mm_sub_epi16(step3[28], s3_28_0);
- step3[29] = _mm_sub_epi16(step3[29], s3_29_0);
- step3[30] = _mm_sub_epi16(step3[30], s3_30_0);
- step3[31] = _mm_sub_epi16(step3[31], s3_31_0);
- step3[ 0] = _mm_add_epi16(step3[ 0], kOne);
- step3[ 1] = _mm_add_epi16(step3[ 1], kOne);
- step3[ 2] = _mm_add_epi16(step3[ 2], kOne);
- step3[ 3] = _mm_add_epi16(step3[ 3], kOne);
- step3[ 4] = _mm_add_epi16(step3[ 4], kOne);
- step3[ 5] = _mm_add_epi16(step3[ 5], kOne);
- step3[ 6] = _mm_add_epi16(step3[ 6], kOne);
- step3[ 7] = _mm_add_epi16(step3[ 7], kOne);
- step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
- step3[10] = _mm_add_epi16(step3[10], kOne);
- step3[11] = _mm_add_epi16(step3[11], kOne);
- step3[12] = _mm_add_epi16(step3[12], kOne);
- step3[13] = _mm_add_epi16(step3[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step3[16] = _mm_add_epi16(step3[16], kOne);
- step3[17] = _mm_add_epi16(step3[17], kOne);
- step3[18] = _mm_add_epi16(step3[18], kOne);
- step3[19] = _mm_add_epi16(step3[19], kOne);
- step3[20] = _mm_add_epi16(step3[20], kOne);
- step3[21] = _mm_add_epi16(step3[21], kOne);
- step3[22] = _mm_add_epi16(step3[22], kOne);
- step3[23] = _mm_add_epi16(step3[23], kOne);
- step3[24] = _mm_add_epi16(step3[24], kOne);
- step3[25] = _mm_add_epi16(step3[25], kOne);
- step3[26] = _mm_add_epi16(step3[26], kOne);
- step3[27] = _mm_add_epi16(step3[27], kOne);
- step3[28] = _mm_add_epi16(step3[28], kOne);
- step3[29] = _mm_add_epi16(step3[29], kOne);
- step3[30] = _mm_add_epi16(step3[30], kOne);
- step3[31] = _mm_add_epi16(step3[31], kOne);
- step3[ 0] = _mm_srai_epi16(step3[ 0], 2);
- step3[ 1] = _mm_srai_epi16(step3[ 1], 2);
- step3[ 2] = _mm_srai_epi16(step3[ 2], 2);
- step3[ 3] = _mm_srai_epi16(step3[ 3], 2);
- step3[ 4] = _mm_srai_epi16(step3[ 4], 2);
- step3[ 5] = _mm_srai_epi16(step3[ 5], 2);
- step3[ 6] = _mm_srai_epi16(step3[ 6], 2);
- step3[ 7] = _mm_srai_epi16(step3[ 7], 2);
- step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
- step3[10] = _mm_srai_epi16(step3[10], 2);
- step3[11] = _mm_srai_epi16(step3[11], 2);
- step3[12] = _mm_srai_epi16(step3[12], 2);
- step3[13] = _mm_srai_epi16(step3[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step3[16] = _mm_srai_epi16(step3[16], 2);
- step3[17] = _mm_srai_epi16(step3[17], 2);
- step3[18] = _mm_srai_epi16(step3[18], 2);
- step3[19] = _mm_srai_epi16(step3[19], 2);
- step3[20] = _mm_srai_epi16(step3[20], 2);
- step3[21] = _mm_srai_epi16(step3[21], 2);
- step3[22] = _mm_srai_epi16(step3[22], 2);
- step3[23] = _mm_srai_epi16(step3[23], 2);
- step3[24] = _mm_srai_epi16(step3[24], 2);
- step3[25] = _mm_srai_epi16(step3[25], 2);
- step3[26] = _mm_srai_epi16(step3[26], 2);
- step3[27] = _mm_srai_epi16(step3[27], 2);
- step3[28] = _mm_srai_epi16(step3[28], 2);
- step3[29] = _mm_srai_epi16(step3[29], 2);
- step3[30] = _mm_srai_epi16(step3[30], 2);
- step3[31] = _mm_srai_epi16(step3[31], 2);
- }
-#endif
-
-#if FDCT32x32_HIGH_PRECISION
- if (pass == 0) {
-#endif
// Stage 4
{
step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
const __m128i mask16 = _mm_set1_epi32(0x80008000);
const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
// start using 32-bit operations
+ // stage 3
+ {
+ // expanding to 32-bit length prior to addition operations
+ lstep2[ 0] = k_cvtlo_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 1] = k_cvthi_epi16(step2[ 0], mask16, kZero);
+ lstep2[ 2] = k_cvtlo_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 3] = k_cvthi_epi16(step2[ 1], mask16, kZero);
+ lstep2[ 4] = k_cvtlo_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 5] = k_cvthi_epi16(step2[ 2], mask16, kZero);
+ lstep2[ 6] = k_cvtlo_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 7] = k_cvthi_epi16(step2[ 3], mask16, kZero);
+ lstep2[ 8] = k_cvtlo_epi16(step2[ 4], mask16, kZero);
+ lstep2[ 9] = k_cvthi_epi16(step2[ 4], mask16, kZero);
+ lstep2[10] = k_cvtlo_epi16(step2[ 5], mask16, kZero);
+ lstep2[11] = k_cvthi_epi16(step2[ 5], mask16, kZero);
+ lstep2[12] = k_cvtlo_epi16(step2[ 6], mask16, kZero);
+ lstep2[13] = k_cvthi_epi16(step2[ 6], mask16, kZero);
+ lstep2[14] = k_cvtlo_epi16(step2[ 7], mask16, kZero);
+ lstep2[15] = k_cvthi_epi16(step2[ 7], mask16, kZero);
+
+ lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = k_cvtlo_epi16(step2[20], mask16, kZero);
+ lstep2[41] = k_cvthi_epi16(step2[20], mask16, kZero);
+ lstep2[42] = k_cvtlo_epi16(step2[21], mask16, kZero);
+ lstep2[43] = k_cvthi_epi16(step2[21], mask16, kZero);
+ lstep2[44] = k_cvtlo_epi16(step2[22], mask16, kZero);
+ lstep2[45] = k_cvthi_epi16(step2[22], mask16, kZero);
+ lstep2[46] = k_cvtlo_epi16(step2[23], mask16, kZero);
+ lstep2[47] = k_cvthi_epi16(step2[23], mask16, kZero);
+ lstep2[48] = k_cvtlo_epi16(step2[24], mask16, kZero);
+ lstep2[49] = k_cvthi_epi16(step2[24], mask16, kZero);
+ lstep2[50] = k_cvtlo_epi16(step2[25], mask16, kZero);
+ lstep2[51] = k_cvthi_epi16(step2[25], mask16, kZero);
+ lstep2[52] = k_cvtlo_epi16(step2[26], mask16, kZero);
+ lstep2[53] = k_cvthi_epi16(step2[26], mask16, kZero);
+ lstep2[54] = k_cvtlo_epi16(step2[27], mask16, kZero);
+ lstep2[55] = k_cvthi_epi16(step2[27], mask16, kZero);
+
+ lstep1[32] = k_cvtlo_epi16(step1[16], mask16, kZero);
+ lstep1[33] = k_cvthi_epi16(step1[16], mask16, kZero);
+ lstep1[34] = k_cvtlo_epi16(step1[17], mask16, kZero);
+ lstep1[35] = k_cvthi_epi16(step1[17], mask16, kZero);
+ lstep1[36] = k_cvtlo_epi16(step1[18], mask16, kZero);
+ lstep1[37] = k_cvthi_epi16(step1[18], mask16, kZero);
+ lstep1[38] = k_cvtlo_epi16(step1[19], mask16, kZero);
+ lstep1[39] = k_cvthi_epi16(step1[19], mask16, kZero);
+ lstep1[56] = k_cvtlo_epi16(step1[28], mask16, kZero);
+ lstep1[57] = k_cvthi_epi16(step1[28], mask16, kZero);
+ lstep1[58] = k_cvtlo_epi16(step1[29], mask16, kZero);
+ lstep1[59] = k_cvthi_epi16(step1[29], mask16, kZero);
+ lstep1[60] = k_cvtlo_epi16(step1[30], mask16, kZero);
+ lstep1[61] = k_cvthi_epi16(step1[30], mask16, kZero);
+ lstep1[62] = k_cvtlo_epi16(step1[31], mask16, kZero);
+ lstep1[63] = k_cvthi_epi16(step1[31], mask16, kZero);
+
+ lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
+ lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
+ }
+
// stage 4
{
// expanding to 32-bit length priori to addition operations
- lstep3[ 0] = k_cvtlo_epi16(step3[ 0], mask16, kZero);
- lstep3[ 1] = k_cvthi_epi16(step3[ 0], mask16, kZero);
- lstep3[ 2] = k_cvtlo_epi16(step3[ 1], mask16, kZero);
- lstep3[ 3] = k_cvthi_epi16(step3[ 1], mask16, kZero);
- lstep3[ 4] = k_cvtlo_epi16(step3[ 2], mask16, kZero);
- lstep3[ 5] = k_cvthi_epi16(step3[ 2], mask16, kZero);
- lstep3[ 6] = k_cvtlo_epi16(step3[ 3], mask16, kZero);
- lstep3[ 7] = k_cvthi_epi16(step3[ 3], mask16, kZero);
- lstep3[20] = k_cvtlo_epi16(step3[10], mask16, kZero);
- lstep3[21] = k_cvthi_epi16(step3[10], mask16, kZero);
- lstep3[22] = k_cvtlo_epi16(step3[11], mask16, kZero);
- lstep3[23] = k_cvthi_epi16(step3[11], mask16, kZero);
- lstep3[24] = k_cvtlo_epi16(step3[12], mask16, kZero);
- lstep3[25] = k_cvthi_epi16(step3[12], mask16, kZero);
- lstep3[26] = k_cvtlo_epi16(step3[13], mask16, kZero);
- lstep3[27] = k_cvthi_epi16(step3[13], mask16, kZero);
lstep2[16] = k_cvtlo_epi16(step2[ 8], mask16, kZero);
lstep2[17] = k_cvthi_epi16(step2[ 8], mask16, kZero);
lstep2[18] = k_cvtlo_epi16(step2[ 9], mask16, kZero);
lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
}
{
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- lstep1[10] = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // to be continued...
+ //
+ const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
+ const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+ // TODO(jingning): manually inline k_madd_epi32_ to further hide
+ // instruction latency.
+ v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64(v[0], v[1]);
+ u[1] = k_packs_epi64(v[2], v[3]);
+ u[2] = k_packs_epi64(v[4], v[5]);
+ u[3] = k_packs_epi64(v[6], v[7]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
}
{
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- lstep1[36] = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- lstep1[54] = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- lstep1[55] = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- lstep1[56] = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- lstep1[57] = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- lstep1[58] = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- lstep1[59] = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+
+ u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64(v[10], v[11]);
+ u[ 6] = k_packs_epi64(v[12], v[13]);
+ u[ 7] = k_packs_epi64(v[14], v[15]);
+ u[ 8] = k_packs_epi64(v[16], v[17]);
+ u[ 9] = k_packs_epi64(v[18], v[19]);
+ u[10] = k_packs_epi64(v[20], v[21]);
+ u[11] = k_packs_epi64(v[22], v[23]);
+ u[12] = k_packs_epi64(v[24], v[25]);
+ u[13] = k_packs_epi64(v[26], v[27]);
+ u[14] = k_packs_epi64(v[28], v[29]);
+ u[15] = k_packs_epi64(v[30], v[31]);
+
+ v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
}
// stage 5
{
- lstep3[ 8] = k_cvtlo_epi16(step3[4], mask16, kZero);
- lstep3[ 9] = k_cvthi_epi16(step3[4], mask16, kZero);
- lstep3[14] = k_cvtlo_epi16(step3[7], mask16, kZero);
- lstep3[15] = k_cvthi_epi16(step3[7], mask16, kZero);
-
lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
}
{
- lstep3[32] = k_cvtlo_epi16(step3[16], mask16, kZero);
- lstep3[33] = k_cvthi_epi16(step3[16], mask16, kZero);
- lstep3[34] = k_cvtlo_epi16(step3[17], mask16, kZero);
- lstep3[35] = k_cvthi_epi16(step3[17], mask16, kZero);
- lstep3[44] = k_cvtlo_epi16(step3[22], mask16, kZero);
- lstep3[45] = k_cvthi_epi16(step3[22], mask16, kZero);
- lstep3[46] = k_cvtlo_epi16(step3[23], mask16, kZero);
- lstep3[47] = k_cvthi_epi16(step3[23], mask16, kZero);
- lstep3[48] = k_cvtlo_epi16(step3[24], mask16, kZero);
- lstep3[49] = k_cvthi_epi16(step3[24], mask16, kZero);
- lstep3[50] = k_cvtlo_epi16(step3[25], mask16, kZero);
- lstep3[51] = k_cvthi_epi16(step3[25], mask16, kZero);
- lstep3[60] = k_cvtlo_epi16(step3[30], mask16, kZero);
- lstep3[61] = k_cvthi_epi16(step3[30], mask16, kZero);
- lstep3[62] = k_cvtlo_epi16(step3[31], mask16, kZero);
- lstep3[63] = k_cvthi_epi16(step3[31], mask16, kZero);
-
lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);