// Transpose top left and top right quarters into one contiguous location to
// process the top half.
- transpose_s16_8x8_new(&temp0[0], &temp2[0]);
- transpose_s16_8x8_new(&temp1[0], &temp2[8]);
+ transpose_s16_8x8q(&temp0[0], &temp2[0]);
+ transpose_s16_8x8q(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3);
vpx_fdct8x16_body(temp3, temp2);
// Transpose bottom left and bottom right quarters into one contiguous
// location to process the bottom half.
- transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+ transpose_s16_8x8q(&temp0[8], &temp1[0]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_s16_8x8_new(&temp1[0], &temp0[0]);
- transpose_s16_8x8_new(&temp2[0], &temp0[8]);
- transpose_s16_8x8_new(&temp3[0], &temp0[16]);
- transpose_s16_8x8_new(&temp4[0], &temp0[24]);
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
store(output, temp5);
// Second row of 8x32.
- transpose_s16_8x8_new(&temp1[8], &temp0[0]);
- transpose_s16_8x8_new(&temp2[8], &temp0[8]);
- transpose_s16_8x8_new(&temp3[8], &temp0[16]);
- transpose_s16_8x8_new(&temp4[8], &temp0[24]);
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_s16_8x8_new(&temp1[16], &temp0[0]);
- transpose_s16_8x8_new(&temp2[16], &temp0[8]);
- transpose_s16_8x8_new(&temp3[16], &temp0[16]);
- transpose_s16_8x8_new(&temp4[16], &temp0[24]);
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_s16_8x8_new(&temp1[24], &temp0[0]);
- transpose_s16_8x8_new(&temp2[24], &temp0[8]);
- transpose_s16_8x8_new(&temp3[24], &temp0[16]);
- transpose_s16_8x8_new(&temp4[24], &temp0[24]);
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_s16_8x8_new(&temp1[0], &temp0[0]);
- transpose_s16_8x8_new(&temp2[0], &temp0[8]);
- transpose_s16_8x8_new(&temp3[0], &temp0[16]);
- transpose_s16_8x8_new(&temp4[0], &temp0[24]);
+ transpose_s16_8x8q(&temp1[0], &temp0[0]);
+ transpose_s16_8x8q(&temp2[0], &temp0[8]);
+ transpose_s16_8x8q(&temp3[0], &temp0[16]);
+ transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
store(output, temp5);
// Second row of 8x32.
- transpose_s16_8x8_new(&temp1[8], &temp0[0]);
- transpose_s16_8x8_new(&temp2[8], &temp0[8]);
- transpose_s16_8x8_new(&temp3[8], &temp0[16]);
- transpose_s16_8x8_new(&temp4[8], &temp0[24]);
+ transpose_s16_8x8q(&temp1[8], &temp0[0]);
+ transpose_s16_8x8q(&temp2[8], &temp0[8]);
+ transpose_s16_8x8q(&temp3[8], &temp0[16]);
+ transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_s16_8x8_new(&temp1[16], &temp0[0]);
- transpose_s16_8x8_new(&temp2[16], &temp0[8]);
- transpose_s16_8x8_new(&temp3[16], &temp0[16]);
- transpose_s16_8x8_new(&temp4[16], &temp0[24]);
+ transpose_s16_8x8q(&temp1[16], &temp0[0]);
+ transpose_s16_8x8q(&temp2[16], &temp0[8]);
+ transpose_s16_8x8q(&temp3[16], &temp0[16]);
+ transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_s16_8x8_new(&temp1[24], &temp0[0]);
- transpose_s16_8x8_new(&temp2[24], &temp0[8]);
- transpose_s16_8x8_new(&temp3[24], &temp0[16]);
- transpose_s16_8x8_new(&temp4[24], &temp0[24]);
+ transpose_s16_8x8q(&temp1[24], &temp0[0]);
+ transpose_s16_8x8q(&temp2[24], &temp0[8]);
+ transpose_s16_8x8q(&temp3[24], &temp0[16]);
+ transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
// b0.val[1]: 04 05 06 07 20 21 22 23
// Regroups the 64-bit halves of a0 and a1 and returns them viewed as s16 lanes:
//   val[0] = { low half of a0,  low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
// AArch64-only path: vtrn1q_s64 / vtrn2q_s64 operate on whole 64-bit lanes.
// NOTE(review): expected to produce the same lanes as the generic path below;
// confirm against the Arm intrinsics reference.
+#if defined(__aarch64__)
+ b0.val[0] = vreinterpretq_s16_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s16_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
// Generic path: split each input into 64-bit halves and recombine the matching
// halves of a0 and a1.
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
return b0;
}
// Unsigned counterpart of the 64-bit regrouping helper. Returns, viewed as u16
// lanes:
//   val[0] = { low half of a0,  low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
uint16x8x2_t b0;
// AArch64-only path: vtrn1q_u64 / vtrn2q_u64 operate on whole 64-bit lanes.
// NOTE(review): expected to produce the same lanes as the generic path below;
// confirm against the Arm intrinsics reference.
+#if defined(__aarch64__)
+ b0.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
// Generic path: split each input into 64-bit halves and recombine the matching
// halves of a0 and a1.
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
return b0;
}
}
// Transpose 8x8 to a new location.
-static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
+// Reads the eight rows a[0..7] (a is only read, hence const, matching the
+// function this replaces) and stores the transposed rows to out[0..7].
+static INLINE void transpose_s16_8x8q(const int16x8_t *a, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
}
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
+
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
+
const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);