const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan_ptr,
- const int16_t *iscan_ptr) {
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
- (void)scan_ptr;
+ (void)scan;
(void)skip_block;
assert(!skip_block);
const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
const int16x8_t dequant = vld1q_s16(dequant_ptr);
// Add one because the eob does not index from 0.
- const uint16x8_t iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const uint16x8_t v_iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
qcoeff = vandq_s16(qcoeff, zbin_mask);
// Set non-zero elements to -1 and use that to extract values for eob.
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
coeff_ptr += 8;
- iscan_ptr += 8;
+ iscan += 8;
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
do {
// Add one because the eob is not its index.
- const uint16x8_t iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const uint16x8_t v_iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
- vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
coeff_ptr += 8;
- iscan_ptr += 8;
+ iscan += 8;
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
-void vpx_quantize_b_32x32_neon(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
int i;
- (void)scan_ptr;
+ (void)scan;
(void)n_coeffs; // Because we will always calculate 32*32.
(void)skip_block;
assert(!skip_block);
const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
const int16x8_t dequant = vld1q_s16(dequant_ptr);
// Add one because the eob does not index from 0.
- const uint16x8_t iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const uint16x8_t v_iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
qcoeff = vandq_s16(qcoeff, zbin_mask);
// Set non-zero elements to -1 and use that to extract values for eob.
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan);
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
coeff_ptr += 8;
- iscan_ptr += 8;
+ iscan += 8;
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
for (i = 1; i < 32 * 32 / 8; ++i) {
// Add one because the eob is not its index.
- const uint16x8_t iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one));
+ const uint16x8_t v_iscan =
+ vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
- vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan));
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
coeff_ptr += 8;
- iscan_ptr += 8;
+ iscan += 8;
store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
return vreinterpret_u8_u32(aa);
}
-static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
- const uint8_t *const ref[4], const int ref_stride,
- const int height, uint32_t *const res) {
+static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
+ const uint8_t *const ref_array[4],
+ const int ref_stride, const int height,
+ uint32_t *const res) {
int i;
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
uint16x4_t a[2];
uint32x4_t r;
- assert(!((intptr_t)src % sizeof(uint32_t)));
+ assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
assert(!(src_stride % sizeof(uint32_t)));
for (i = 0; i < height; ++i) {
const uint8x8_t s = vreinterpret_u8_u32(
- vld1_dup_u32((const uint32_t *)(src + i * src_stride)));
- const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,
- ref[1] + i * ref_stride);
- const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,
- ref[3] + i * ref_stride);
+ vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
+ const uint8x8_t ref01 = load_unaligned_2_buffers(
+ ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
+ const uint8x8_t ref23 = load_unaligned_2_buffers(
+ ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
abs[0] = vabal_u8(abs[0], s, ref01);
abs[1] = vabal_u8(abs[1], s, ref23);
}
vst1q_u32(res, r);
}
-void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad4x_4d(src, src_stride, ref, ref_stride, 4, res);
+ sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
}
-void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad4x_4d(src, src_stride, ref, ref_stride, 8, res);
+ sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
}
////////////////////////////////////////////////////////////////////////////////
vst1q_u32(res, vcombine_u32(d0, d1));
}
-static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res, const int height) {
int i, j;
- const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+ ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
- const uint8x8_t s = vld1_u8(src);
- src += src_stride;
+ const uint8x8_t s = vld1_u8(src_ptr);
+ src_ptr += src_stride;
for (j = 0; j < 4; ++j) {
const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
ref_loop[j] += ref_stride;
sad_512_pel_final_neon(sum, res);
}
-void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad8x_4d(src, src_stride, ref, ref_stride, res, 4);
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
}
-void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad8x_4d(src, src_stride, ref, ref_stride, res, 8);
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}
-void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}
////////////////////////////////////////////////////////////////////////////////
-static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,
+static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
- const uint8x16_t r = vld1q_u8(ref);
- *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));
- *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+ *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
+ *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
}
-static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res, const int height) {
int i, j;
- const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+ ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
- const uint8x16_t s = vld1q_u8(src);
- src += src_stride;
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
for (j = 0; j < 4; ++j) {
sad16_neon(ref_loop[j], s, &sum[j]);
ref_loop[j] += ref_stride;
sad_512_pel_final_neon(sum, res);
}
-void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad16x_4d(src, src_stride, ref, ref_stride, res, 8);
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
}
-void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad16x_4d(src, src_stride, ref, ref_stride, res, 16);
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
}
-void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
- sad16x_4d(src, src_stride, ref, ref_stride, res, 32);
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
}
////////////////////////////////////////////////////////////////////////////////
-static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
const int height, uint16x8_t *const sum) {
int i;
- const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+ ref_array[3] };
sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
uint8x16_t s;
- s = vld1q_u8(src + 0 * 16);
+ s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
- s = vld1q_u8(src + 1 * 16);
+ s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
- src += src_stride;
+ src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
}
}
-void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
uint16x8_t sum[4];
- sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
sad_512_pel_final_neon(sum, res);
}
-void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
uint16x8_t sum[4];
- sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
sad_1024_pel_final_neon(sum, res);
}
-void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
uint16x8_t sum[4];
- sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
sad_2048_pel_final_neon(sum, res);
}
////////////////////////////////////////////////////////////////////////////////
-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
int i;
- const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+ ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < 32; ++i) {
uint8x16_t s;
- s = vld1q_u8(src + 0 * 16);
+ s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
- s = vld1q_u8(src + 1 * 16);
+ s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
- s = vld1q_u8(src + 2 * 16);
+ s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
- s = vld1q_u8(src + 3 * 16);
+ s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
- src += src_stride;
+ src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
sad_2048_pel_final_neon(sum, res);
}
-void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t *res) {
int i;
- const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
+ ref_array[3] };
uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0) };
for (i = 0; i < 64; ++i) {
uint8x16_t s;
- s = vld1q_u8(src + 0 * 16);
+ s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
- s = vld1q_u8(src + 1 * 16);
+ s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
- s = vld1q_u8(src + 2 * 16);
+ s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
- s = vld1q_u8(src + 3 * 16);
+ s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
- src += src_stride;
+ src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
return vget_lane_u32(horizontal_add_uint16x8(abs), 0);
}
-static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, const int height) {
+static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x8_t a_u8 = vld1_u8(a);
- const uint8x8_t b_u8 = vld1_u8(b);
- a += a_stride;
- b += b_stride;
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
abs = vabal_u8(abs, a_u8, b_u8);
}
return abs;
}
-static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x8_t a_u8 = vld1_u8(a);
- const uint8x8_t b_u8 = vld1_u8(b);
- const uint8x8_t c_u8 = vld1_u8(c);
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t c_u8 = vld1_u8(second_pred);
const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
- a += a_stride;
- b += b_stride;
- c += 8;
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
abs = vabal_u8(abs, a_u8, avg);
}
return abs;
}
-#define sad8xN(n) \
- uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
- } \
- \
- uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- const uint16x8_t abs = \
- sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+#define sad8xN(n) \
+ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
sad8xN(4);
sad8xN(8);
sad8xN(16);
-static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_u8 = vld1q_u8(a);
- const uint8x16_t b_u8 = vld1q_u8(b);
- a += a_stride;
- b += b_stride;
+ const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8));
abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8));
}
return abs;
}
-static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_u8 = vld1q_u8(a);
- const uint8x16_t b_u8 = vld1q_u8(b);
- const uint8x16_t c_u8 = vld1q_u8(c);
+ const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t c_u8 = vld1q_u8(second_pred);
const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
- a += a_stride;
- b += b_stride;
- c += 16;
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg));
abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg));
}
return abs;
}
-#define sad16xN(n) \
- uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
- } \
- \
- uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- const uint16x8_t abs = \
- sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+#define sad16xN(n) \
+ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint16x8_t abs = \
+ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
sad16xN(8);
sad16xN(16);
sad16xN(32);
-static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
+static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_lo = vld1q_u8(a);
- const uint8x16_t a_hi = vld1q_u8(a + 16);
- const uint8x16_t b_lo = vld1q_u8(b);
- const uint8x16_t b_hi = vld1q_u8(b + 16);
- a += a_stride;
- b += b_stride;
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo));
abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo));
abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi));
return abs;
}
-static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *c, const int height) {
+static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_lo = vld1q_u8(a);
- const uint8x16_t a_hi = vld1q_u8(a + 16);
- const uint8x16_t b_lo = vld1q_u8(b);
- const uint8x16_t b_hi = vld1q_u8(b + 16);
- const uint8x16_t c_lo = vld1q_u8(c);
- const uint8x16_t c_hi = vld1q_u8(c + 16);
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t c_lo = vld1q_u8(second_pred);
+ const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
- a += a_stride;
- b += b_stride;
- c += 32;
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo));
abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo));
abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi));
return abs;
}
-#define sad32xN(n) \
- uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
- } \
- \
- uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- const uint16x8_t abs = \
- sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
- return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+#define sad32xN(n) \
+ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint16x8_t abs = \
+ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint16x8_t abs = \
+ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \
}
sad32xN(16);
sad32xN(32);
sad32xN(64);
-static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
const int height) {
int i;
uint16x8_t abs_0 = vdupq_n_u16(0);
uint16x8_t abs_1 = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_0 = vld1q_u8(a);
- const uint8x16_t a_1 = vld1q_u8(a + 16);
- const uint8x16_t a_2 = vld1q_u8(a + 32);
- const uint8x16_t a_3 = vld1q_u8(a + 48);
- const uint8x16_t b_0 = vld1q_u8(b);
- const uint8x16_t b_1 = vld1q_u8(b + 16);
- const uint8x16_t b_2 = vld1q_u8(b + 32);
- const uint8x16_t b_3 = vld1q_u8(b + 48);
- a += a_stride;
- b += b_stride;
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0));
abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0));
abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1));
}
}
-static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *c, const int height) {
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
int i;
uint16x8_t abs_0 = vdupq_n_u16(0);
uint16x8_t abs_1 = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_0 = vld1q_u8(a);
- const uint8x16_t a_1 = vld1q_u8(a + 16);
- const uint8x16_t a_2 = vld1q_u8(a + 32);
- const uint8x16_t a_3 = vld1q_u8(a + 48);
- const uint8x16_t b_0 = vld1q_u8(b);
- const uint8x16_t b_1 = vld1q_u8(b + 16);
- const uint8x16_t b_2 = vld1q_u8(b + 32);
- const uint8x16_t b_3 = vld1q_u8(b + 48);
- const uint8x16_t c_0 = vld1q_u8(c);
- const uint8x16_t c_1 = vld1q_u8(c + 16);
- const uint8x16_t c_2 = vld1q_u8(c + 32);
- const uint8x16_t c_3 = vld1q_u8(c + 48);
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t c_0 = vld1q_u8(second_pred);
+ const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+ const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+ const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
- a += a_stride;
- b += b_stride;
- c += 64;
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0));
abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0));
abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1));
}
}
-#define sad64xN(n) \
- uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \
- return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
- } \
- \
- uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- const uint32x4_t abs = \
- sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \
- return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
+#define sad64xN(n) \
+ uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t abs = \
+ sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
+ } \
+ \
+ uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t abs = \
+ sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \
}
sad64xN(32);
uint8_t *buffer;
} vpx_writer;
-void vpx_start_encode(vpx_writer *bc, uint8_t *buffer);
-void vpx_stop_encode(vpx_writer *bc);
+void vpx_start_encode(vpx_writer *br, uint8_t *source);
+void vpx_stop_encode(vpx_writer *br);
static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
unsigned int split;
9, 10, 13,
};
-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line, int cols,
- unsigned char *f, int size) {
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+ unsigned char *dst, int src_pitch,
+ int dst_pitch, int cols,
+ unsigned char *flimits, int size) {
unsigned char *p_src, *p_dst;
int row;
int col;
for (row = 0; row < size; row++) {
/* post_proc_down for one row */
- p_src = src_ptr;
- p_dst = dst_ptr;
+ p_src = src;
+ p_dst = dst;
for (col = 0; col < cols; col++) {
- unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
- unsigned char p_above1 = p_src[col - src_pixels_per_line];
- unsigned char p_below1 = p_src[col + src_pixels_per_line];
- unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+ unsigned char p_above2 = p_src[col - 2 * src_pitch];
+ unsigned char p_above1 = p_src[col - src_pitch];
+ unsigned char p_below1 = p_src[col + src_pitch];
+ unsigned char p_below2 = p_src[col + 2 * src_pitch];
v = p_src[col];
- if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
- (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+ if ((abs(v - p_above2) < flimits[col]) &&
+ (abs(v - p_above1) < flimits[col]) &&
+ (abs(v - p_below1) < flimits[col]) &&
+ (abs(v - p_below2) < flimits[col])) {
unsigned char k1, k2, k3;
k1 = (p_above2 + p_above1 + 1) >> 1;
k2 = (p_below2 + p_below1 + 1) >> 1;
}
/* now post_proc_across */
- p_src = dst_ptr;
- p_dst = dst_ptr;
+ p_src = dst;
+ p_dst = dst;
p_src[-2] = p_src[-1] = p_src[0];
p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
for (col = 0; col < cols; col++) {
v = p_src[col];
- if ((abs(v - p_src[col - 2]) < f[col]) &&
- (abs(v - p_src[col - 1]) < f[col]) &&
- (abs(v - p_src[col + 1]) < f[col]) &&
- (abs(v - p_src[col + 2]) < f[col])) {
+ if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+ (abs(v - p_src[col - 1]) < flimits[col]) &&
+ (abs(v - p_src[col + 1]) < flimits[col]) &&
+ (abs(v - p_src[col + 2]) < flimits[col])) {
unsigned char k1, k2, k3;
k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
p_dst[col - 1] = d[(col - 1) & 3];
/* next row */
- src_ptr += src_pixels_per_line;
- dst_ptr += dst_pixels_per_line;
+ src += src_pitch;
+ dst += dst_pitch;
}
}
output[0] = sum * 2;
}
-void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
int i, j;
tran_low_t intermediate[64];
int pass;
- tran_low_t *output = intermediate;
+ tran_low_t *out = intermediate;
const tran_low_t *in = NULL;
// Transform columns
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = (tran_low_t)fdct_round_shift(t0);
- output[2] = (tran_low_t)fdct_round_shift(t2);
- output[4] = (tran_low_t)fdct_round_shift(t1);
- output[6] = (tran_low_t)fdct_round_shift(t3);
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[2] = (tran_low_t)fdct_round_shift(t2);
+ out[4] = (tran_low_t)fdct_round_shift(t1);
+ out[6] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = (tran_low_t)fdct_round_shift(t0);
- output[3] = (tran_low_t)fdct_round_shift(t2);
- output[5] = (tran_low_t)fdct_round_shift(t1);
- output[7] = (tran_low_t)fdct_round_shift(t3);
- output += 8;
+ out[1] = (tran_low_t)fdct_round_shift(t0);
+ out[3] = (tran_low_t)fdct_round_shift(t2);
+ out[5] = (tran_low_t)fdct_round_shift(t1);
+ out[7] = (tran_low_t)fdct_round_shift(t3);
+ out += 8;
}
in = intermediate;
- output = final_output;
+ out = output;
}
// Rows
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+ for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
}
}
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
int i, j;
- tran_high_t output[32 * 32];
+ tran_high_t out[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
vpx_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
- out[j + i * 32] =
+ output[j + i * 32] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
// Note that although we use dct_32_round in dct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
int i, j;
- tran_high_t output[32 * 32];
+ tran_high_t out[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
// TODO(cd): see quality impact of only doing
- // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
+ // out[j * 32 + i] = (temp_out[j] + 1) >> 2;
// PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
// Rows
for (i = 0; i < 32; ++i) {
tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
vpx_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
+ for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
}
}
vpx_fdct4x4_c(input, output, stride);
}
-void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
int stride) {
- vpx_fdct8x8_c(input, final_output, stride);
+ vpx_fdct8x8_c(input, output, stride);
}
-void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
int stride) {
- vpx_fdct8x8_1_c(input, final_output, stride);
+ vpx_fdct8x8_1_c(input, output, stride);
}
void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
vpx_fdct16x16_1_c(input, output, stride);
}
-void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- vpx_fdct32x32_c(input, out, stride);
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vpx_fdct32x32_c(input, output, stride);
}
-void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
int stride) {
- vpx_fdct32x32_rd_c(input, out, stride);
+ vpx_fdct32x32_rd_c(input, output, stride);
}
-void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
int stride) {
- vpx_fdct32x32_1_c(input, out, stride);
+ vpx_fdct32x32_1_c(input, output, stride);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
-void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
- const tran_low_t *ip = in;
+ const tran_low_t *ip = input;
tran_low_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
}
}
-void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest,
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
tran_high_t a1, e1;
tran_low_t tmp[4];
- const tran_low_t *ip = in;
+ const tran_low_t *ip = input;
tran_low_t *op = tmp;
(void)bd;
*op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
}
-void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
- filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch);
++s;
}
}
-void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
- vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1);
+ vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
}
}
-void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, s + 3 * p);
+ filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch,
+ s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch);
++s;
}
}
-void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
- vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
- vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1);
+ vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
+ vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1);
}
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
}
}
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
+ const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 =
- flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
- s[4 * p], s[5 * p], s[6 * p], s[7 * p]);
-
- filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
- s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p,
- s + 7 * p);
+ const int8_t flat2 = flat_mask5(
+ 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+ s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]);
+
+ filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+ s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch,
+ s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch,
+ s + 7 * pitch);
++s;
}
}
-void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+ mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1);
}
-void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+ mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2);
}
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int count) {
int i;
filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
s + 7);
- s += p;
+ s += pitch;
}
}
-void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+ mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8);
}
-void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
+void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+ mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16);
}
#if CONFIG_VP9_HIGHBITDEPTH
*op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
}
-void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh, int bd) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
+ const uint16_t p3 = s[-4 * pitch];
+ const uint16_t p2 = s[-3 * pitch];
+ const uint16_t p1 = s[-2 * pitch];
+ const uint16_t p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch];
+ const uint16_t q1 = s[1 * pitch];
+ const uint16_t q2 = s[2 * pitch];
+ const uint16_t q3 = s[3 * pitch];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+ highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s,
+ s + 1 * pitch, bd);
++s;
}
}
void vpx_highbd_lpf_horizontal_4_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
- vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd);
+ vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
}
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
}
}
-void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
int i;
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
- const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+ const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch],
+ p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch],
+ q3 = s[3 * pitch];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat =
highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
- s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+ s + 2 * pitch, s + 3 * pitch, bd);
++s;
}
}
void vpx_highbd_lpf_horizontal_8_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
- vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
- vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd);
+ vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd);
}
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
}
}
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count,
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8 * count; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
+ const uint16_t p3 = s[-4 * pitch];
+ const uint16_t p2 = s[-3 * pitch];
+ const uint16_t p1 = s[-2 * pitch];
+ const uint16_t p0 = s[-pitch];
+ const uint16_t q0 = s[0 * pitch];
+ const uint16_t q1 = s[1 * pitch];
+ const uint16_t q2 = s[2 * pitch];
+ const uint16_t q3 = s[3 * pitch];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
const int8_t flat =
highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat2 =
- highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0,
- s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd);
-
- highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p,
- s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p,
- s + 6 * p, s + 7 * p, bd);
+ const int8_t flat2 = highbd_flat_mask5(
+ 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0,
+ s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd);
+
+ highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch,
+ s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch,
+ s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch,
+ s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch,
+ s + 6 * pitch, s + 7 * pitch, bd);
++s;
}
}
-void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
}
-void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd);
}
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int count,
highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
s + 5, s + 6, s + 7, bd);
- s += p;
+ s += pitch;
}
}
-void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh,
int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd);
}
-void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
}
-void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
- int32_t rows, int32_t cols, int32_t flimit) {
+void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows,
+ int32_t cols, int32_t flimit) {
int32_t row, col, cnt;
- uint8_t *src_dup = src_ptr;
- v16u8 src0, src, tmp_orig;
+ uint8_t *src_dup = src;
+ v16u8 src0, src1, tmp_orig;
v16u8 tmp = { 0 };
v16i8 zero = { 0 };
v8u16 sum_h, src_r_h, src_l_h;
src_dup[cols + 16] = src_dup[cols - 1];
tmp_orig = (v16u8)__msa_ldi_b(0);
tmp_orig[15] = tmp[15];
- src = LD_UB(src_dup - 8);
- src[15] = 0;
- ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src1 = LD_UB(src_dup - 8);
+ src1[15] = 0;
+ ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
src_r_w += __msa_dotp_u_w(src_l_h, src_l_h);
sum_sq = HADD_SW_S32(src_r_w) + 16;
- sum_h = __msa_hadd_u_h(src, src);
+ sum_h = __msa_hadd_u_h(src1, src1);
sum = HADD_UH_U32(sum_h);
{
v16u8 src7, src8, src_r, src_l;
sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
}
sum = sum_l[7];
- src = LD_UB(src_dup + 16 * col);
- ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src1 = LD_UB(src_dup + 16 * col);
+ ILVRL_B2_UH(zero, src1, src_r_h, src_l_h);
src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
total3 = (total3 < flimit_vec);
PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
- tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+ tmp = __msa_bmz_v(tmp, src1, (v16u8)mask);
if (col == 0) {
uint64_t src_d;
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int16_t dequant, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
const int16_t quant, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant,
uint16_t *eob_ptr) {
int eob = -1;
const int64_t tmp = abs_coeff + round_ptr[0];
const int abs_qcoeff = (int)((tmp * quant) >> 16);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr;
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int16_t dequant, uint16_t *eob_ptr) {
const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
if (tmp) eob = 0;
}
*eob_ptr = eob + 1;
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr,
- uint16_t *eob_ptr) {
+ const int16_t dequant, uint16_t *eob_ptr) {
const int n_coeffs = 1024;
int eob = -1;
const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1);
const int abs_qcoeff = (int)((tmp * quant) >> 15);
qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2;
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
if (abs_qcoeff) eob = 0;
}
*eob_ptr = eob + 1;
#endif
void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
- const int16_t *round_ptr, const int16_t quant_ptr,
+ const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr);
+ const int16_t dequant, uint16_t *eob_ptr);
void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
- const int16_t *round_ptr, const int16_t quant_ptr,
+ const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr);
+ const int16_t dequant, uint16_t *eob_ptr);
#if CONFIG_VP9_HIGHBITDEPTH
void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
- const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant,
uint16_t *eob_ptr);
void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
- const int16_t *round_ptr,
- const int16_t quant_ptr,
+ const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
- const int16_t dequant_ptr, uint16_t *eob_ptr);
+ const int16_t dequant, uint16_t *eob_ptr);
#endif
#ifdef __cplusplus
#include "vpx_ports/mem.h"
/* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
+static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int width, int height) {
int y, x;
unsigned int sad = 0;
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]);
- a += a_stride;
- b += b_stride;
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
}
return sad;
}
-#define sadMxN(m, n) \
- unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- return sad(src, src_stride, ref, ref_stride, m, n); \
- } \
- unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
- vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
- return sad(src, src_stride, comp_pred, m, m, n); \
+#define sadMxN(m, n) \
+ unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ } \
+ unsigned int vpx_sad##m##x##n##_avg_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \
+ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \
+ return sad(src_ptr, src_stride, comp_pred, m, m, n); \
}
// depending on call sites, pass **ref_array to avoid & in subsequent call and
// de-dup with 4D below.
-#define sadMxNxK(m, n, k) \
- void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref_array, int ref_stride, \
- uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < k; ++i) \
- sad_array[i] = \
- vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \
+#define sadMxNxK(m, n, k) \
+ void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
}
// This appears to be equivalent to the above when k == 4 and refs is const
-#define sadMxNx4D(m, n) \
- void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) \
- sad_array[i] = \
- vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+#define sadMxNx4D(m, n) \
+ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sad_array[i] = \
+ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \
}
/* clang-format off */
#if CONFIG_VP9_HIGHBITDEPTH
static INLINE
- unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height) {
+ unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride, int width,
+ int height) {
int y, x;
unsigned int sad = 0;
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
- a += a_stride;
- b += b_stride;
+ src += src_stride;
+ ref_ptr += ref_stride;
}
return sad;
}
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
- const uint16_t *b, int b_stride,
+static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
int width, int height) {
int y, x;
unsigned int sad = 0;
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
+ for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]);
- a += a_stride;
- b += b_stride;
+ src += src_stride;
+ ref_ptr += ref_stride;
}
return sad;
}
#define highbd_sadMxN(m, n) \
- unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, \
- int ref_stride) { \
- return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ unsigned int vpx_highbd_sad##m##x##n##_c( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
} \
unsigned int vpx_highbd_sad##m##x##n##_avg_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \
vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \
- n, CONVERT_TO_SHORTPTR(ref), ref_stride); \
- return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
+ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \
+ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \
}
-#define highbd_sadMxNx4D(m, n) \
- void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \
- ref_array[i], ref_stride); \
- } \
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
}
/* clang-format off */
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-void vpx_subtract_block_c(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src,
- ptrdiff_t src_stride, const uint8_t *pred,
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
ptrdiff_t pred_stride) {
int r, c;
for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+ for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c];
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
+ diff_ptr += diff_stride;
+ pred_ptr += pred_stride;
+ src_ptr += src_stride;
}
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src8,
- ptrdiff_t src_stride, const uint8_t *pred8,
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src8_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred8_ptr,
ptrdiff_t pred_stride, int bd) {
int r, c;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
(void)bd;
for (r = 0; r < rows; r++) {
for (c = 0; c < cols; c++) {
- diff[c] = src[c] - pred[c];
+ diff_ptr[c] = src[c] - pred[c];
}
- diff += diff_stride;
+ diff_ptr += diff_stride;
pred += pred_stride;
src += src_stride;
}
# Intra prediction
#
-add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_4x4 sse2/;
-add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45_predictor_4x4 neon sse2/;
-add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d63_predictor_4x4 ssse3/;
-add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d135_predictor_4x4 neon/;
-add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_4x4 ssse3/;
-add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_v_predictor_4x4 neon msa sse2/;
-add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/;
-add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/;
-add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/;
-add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/;
-add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_8x8 ssse3/;
-add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/;
-add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/;
-add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d135_predictor_8x8 neon/;
-add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_8x8 ssse3/;
-add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_v_predictor_8x8 neon msa sse2/;
-add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
-add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/;
-add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/;
-add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_16x16 ssse3/;
-add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/;
-add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/;
-add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/;
-add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d135_predictor_16x16 neon/;
-add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_16x16 ssse3/;
-add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d207_predictor_32x32 ssse3/;
-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d135_predictor_32x32 neon/;
-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_d153_predictor_32x32 ssse3/;
-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
- add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
- add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
- add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
- add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
- add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
- add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
- add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
- add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
- add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
- add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
- add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
- add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
- add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
- add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
- add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
- add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
- add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
- add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
- add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
- add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
- add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
- add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
- add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
- add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
- add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
- add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
- add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
} # CONFIG_VP9_HIGHBITDEPTH
#
# Sub Pixel Filters
#
- add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
- add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
- add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+ add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
} # CONFIG_VP9_HIGHBITDEPTH
#
# Multi-block SAD, comparing a reference to N independent blocks
#
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
#
# Block subtraction
#
- add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
#
# Single block SAD
#
# Avg
#
- add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+ add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
specialize qw/vpx_highbd_avg_8x8 sse2/;
- add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+ add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
specialize qw/vpx_highbd_avg_4x4 sse2/;
- add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
specialize qw/vpx_highbd_sad64x64_avg sse2/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
- add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x64x4d sse2/;
- add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad64x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x64x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad32x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad16x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad8x4x4d sse2/;
- add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
specialize qw/vpx_highbd_sad4x4x4d sse2/;
#
add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/;
- add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit";
+ add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit";
specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/mem_sse2.h"
-void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width,
+void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
- /* comp and pred must be 16 byte aligned. */
- assert(((intptr_t)comp & 0xf) == 0);
+ /* comp_pred and pred must be 16 byte aligned. */
+ assert(((intptr_t)comp_pred & 0xf) == 0);
assert(((intptr_t)pred & 0xf) == 0);
if (width > 8) {
int x, y;
const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
const __m128i avg = _mm_avg_epu8(p, r);
- _mm_store_si128((__m128i *)(comp + x), avg);
+ _mm_store_si128((__m128i *)(comp_pred + x), avg);
}
- comp += width;
+ comp_pred += width;
pred += width;
ref += ref_stride;
}
} else { // width must be 4 or 8.
int i;
- // Process 16 elements at a time. comp and pred have width == stride and
- // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all
- // divisible by 16 so just ref needs to be massaged when loading.
+ // Process 16 elements at a time. comp_pred and pred have width == stride
+ // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
+ // all divisible by 16 so just ref needs to be massaged when loading.
for (i = 0; i < width * height; i += 16) {
const __m128i p = _mm_load_si128((const __m128i *)pred);
__m128i r;
ref += 2 * ref_stride;
}
avg = _mm_avg_epu8(p, r);
- _mm_store_si128((__m128i *)comp, avg);
+ _mm_store_si128((__m128i *)comp_pred, avg);
pred += 16;
- comp += 16;
+ comp_pred += 16;
}
}
}
#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
void vpx_convolve8_##name##_##opt( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
- const int16_t *filter = filter_kernel[offset]; \
+ const int16_t *filter_row = filter[offset]; \
(void)x0_q4; \
(void)x_step_q4; \
(void)y0_q4; \
(void)y_step_q4; \
- assert(filter[3] != 128); \
+ assert(filter_row[3] != 128); \
assert(step_q4 == 16); \
- if (filter[0] | filter[1] | filter[6] | filter[7]) { \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
while (w >= 16) { \
vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
src += 16; \
dst += 16; \
w -= 16; \
} \
if (w == 8) { \
vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} else if (w == 4) { \
vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} \
- } else if (filter[2] | filter[5]) { \
+ } else if (filter_row[2] | filter_row[5]) { \
while (w >= 16) { \
vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
src += 16; \
dst += 16; \
w -= 16; \
} \
if (w == 8) { \
vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} else if (w == 4) { \
vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} \
} else { \
while (w >= 16) { \
vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
src += 16; \
dst += 16; \
w -= 16; \
} \
if (w == 8) { \
vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} else if (w == 4) { \
vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
+ dst_stride, h, filter_row); \
} \
} \
}
unsigned int output_height,
const int16_t *filter, int bd);
-#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
- void vpx_highbd_convolve8_##name##_##opt( \
- const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
- ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \
- int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
- const int16_t *filter = filter_kernel[offset]; \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] | filter[1] | filter[6] | filter[7]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else if (filter[2] | filter[5]) { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter_kernel, x0_q4, x_step_q4, y0_q4, \
- y_step_q4, w, h, bd); \
- } \
+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \
+ void vpx_highbd_convolve8_##name##_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \
+ const int16_t *filter_row = filter[offset]; \
+ if (step_q4 == 16 && filter_row[3] != 128) { \
+ if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter_row[2] | filter_row[5]) { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter_row, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h, bd); \
+ } \
}
#define HIGH_FUN_CONV_2D(avg, opt) \
uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
- int width, int h, int bd) {
+ int w, int h, int bd) {
(void)filter;
(void)x0_q4;
(void)x_step_q4;
(void)y_step_q4;
(void)bd;
- assert(width % 4 == 0);
- if (width > 32) { // width = 64
+ assert(w % 4 == 0);
+ if (w > 32) { // w = 64
do {
const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
dst += dst_stride;
h--;
} while (h > 0);
- } else if (width > 16) { // width = 32
+ } else if (w > 16) { // w = 32
do {
const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
dst += dst_stride;
h--;
} while (h > 0);
- } else if (width > 8) { // width = 16
+ } else if (w > 8) { // w = 16
__m256i p0, p1;
do {
p0 = _mm256_loadu_si256((const __m256i *)src);
dst += dst_stride;
h -= 2;
} while (h > 0);
- } else if (width > 4) { // width = 8
+ } else if (w > 4) { // w = 8
__m128i p0, p1;
do {
p0 = _mm_loadu_si128((const __m128i *)src);
dst += dst_stride;
h -= 2;
} while (h > 0);
- } else { // width = 4
+ } else { // w = 4
__m128i p0, p1;
do {
p0 = _mm_loadl_epi64((const __m128i *)src);
uint16_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
int x_step_q4, int y0_q4, int y_step_q4,
- int width, int h, int bd) {
+ int w, int h, int bd) {
(void)filter;
(void)x0_q4;
(void)x_step_q4;
(void)y_step_q4;
(void)bd;
- assert(width % 4 == 0);
- if (width > 32) { // width = 64
+ assert(w % 4 == 0);
+ if (w > 32) { // w = 64
__m256i p0, p1, p2, p3, u0, u1, u2, u3;
do {
p0 = _mm256_loadu_si256((const __m256i *)src);
dst += dst_stride;
h--;
} while (h > 0);
- } else if (width > 16) { // width = 32
+ } else if (w > 16) { // w = 32
__m256i p0, p1, u0, u1;
do {
p0 = _mm256_loadu_si256((const __m256i *)src);
dst += dst_stride;
h--;
} while (h > 0);
- } else if (width > 8) { // width = 16
+ } else if (w > 8) { // w = 16
__m256i p0, p1, u0, u1;
do {
p0 = _mm256_loadu_si256((const __m256i *)src);
dst += dst_stride << 1;
h -= 2;
} while (h > 0);
- } else if (width > 4) { // width = 8
+ } else if (w > 4) { // w = 8
__m128i p0, p1, u0, u1;
do {
p0 = _mm_loadu_si128((const __m128i *)src);
dst += dst_stride << 1;
h -= 2;
} while (h > 0);
- } else { // width = 4
+ } else { // w = 4
__m128i p0, p1, u0, u1;
do {
p0 = _mm_loadl_epi64((const __m128i *)src);
REP_RET
INIT_XMM sse2
-cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd
movd m1, [aboveq-2]
movq m0, [aboveq]
pshuflw m1, m1, 0x0
movlhps m1, m1 ; tl tl tl tl tl tl tl tl
; Get the values to compute the maximum value at this bit depth
pcmpeqw m3, m3
- movd m4, bpsd
+ movd m4, bdd
psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl
psllw m3, m4
pcmpeqw m2, m2
RET
INIT_XMM sse2
-cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one
movd m1, [aboveq-2]
mova m0, [aboveq]
pshuflw m1, m1, 0x0
pxor m3, m3
pxor m4, m4
pinsrw m3, oned, 0
- pinsrw m4, bpsd, 0
+ pinsrw m4, bdd, 0
pshuflw m3, m3, 0x0
DEFINE_ARGS dst, stride, line, left
punpcklqdq m3, m3
REP_RET
INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd
movd m2, [aboveq-2]
mova m0, [aboveq]
mova m1, [aboveq+16]
pshuflw m2, m2, 0x0
; Get the values to compute the maximum value at this bit depth
pcmpeqw m3, m3
- movd m4, bpsd
+ movd m4, bdd
punpcklqdq m2, m2
psllw m3, m4
pcmpeqw m5, m5
REP_RET
INIT_XMM sse2
-cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps
+cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd
movd m0, [aboveq-2]
mova m1, [aboveq]
mova m2, [aboveq+16]
pshuflw m0, m0, 0x0
; Get the values to compute the maximum value at this bit depth
pcmpeqw m5, m5
- movd m6, bpsd
+ movd m6, bdd
psllw m5, m6
pcmpeqw m7, m7
pxor m6, m6 ; min possible value
// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- __m128i blimit, limit, thresh;
+ __m128i blimit_v, limit_v, thresh_v;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
__m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
__m128i ps1, qs1, ps0, qs0;
__m128i eight, four;
if (bd == 8) {
- blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
- limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
- thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
} else if (bd == 10) {
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
} else { // bd == 12
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
}
- q4 = _mm_load_si128((__m128i *)(s + 4 * p));
- p4 = _mm_load_si128((__m128i *)(s - 5 * p));
- q3 = _mm_load_si128((__m128i *)(s + 3 * p));
- p3 = _mm_load_si128((__m128i *)(s - 4 * p));
- q2 = _mm_load_si128((__m128i *)(s + 2 * p));
- p2 = _mm_load_si128((__m128i *)(s - 3 * p));
- q1 = _mm_load_si128((__m128i *)(s + 1 * p));
- p1 = _mm_load_si128((__m128i *)(s - 2 * p));
- q0 = _mm_load_si128((__m128i *)(s + 0 * p));
- p0 = _mm_load_si128((__m128i *)(s - 1 * p));
+ q4 = _mm_load_si128((__m128i *)(s + 4 * pitch));
+ p4 = _mm_load_si128((__m128i *)(s - 5 * pitch));
+ q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
+ p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
// highbd_filter_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
// highbd_hev_mask (in C code this is actually called from highbd_filter4)
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_subs_epu16(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
work = _mm_max_epi16(
_mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
_mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
_mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
- mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_subs_epu16(mask, limit_v);
mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
// lp filter
// (because, in both vars, each block of 16 either all 1s or all 0s)
flat = _mm_and_si128(flat, mask);
- p5 = _mm_load_si128((__m128i *)(s - 6 * p));
- q5 = _mm_load_si128((__m128i *)(s + 5 * p));
- p6 = _mm_load_si128((__m128i *)(s - 7 * p));
- q6 = _mm_load_si128((__m128i *)(s + 6 * p));
- p7 = _mm_load_si128((__m128i *)(s - 8 * p));
- q7 = _mm_load_si128((__m128i *)(s + 7 * p));
+ p5 = _mm_load_si128((__m128i *)(s - 6 * pitch));
+ q5 = _mm_load_si128((__m128i *)(s + 5 * pitch));
+ p6 = _mm_load_si128((__m128i *)(s - 7 * pitch));
+ q6 = _mm_load_si128((__m128i *)(s + 6 * pitch));
+ p7 = _mm_load_si128((__m128i *)(s - 8 * pitch));
+ q7 = _mm_load_si128((__m128i *)(s + 7 * pitch));
// highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
// but referred to as p0-p4 & q0-q4 in fn)
flat2_q6 = _mm_and_si128(flat2, flat2_q6);
// get values for when (flat2 && flat && mask)
q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
- _mm_store_si128((__m128i *)(s - 7 * p), p6);
- _mm_store_si128((__m128i *)(s + 6 * p), q6);
+ _mm_store_si128((__m128i *)(s - 7 * pitch), p6);
+ _mm_store_si128((__m128i *)(s + 6 * pitch), q6);
p5 = _mm_andnot_si128(flat2, p5);
// p5 remains unchanged if !(flat2 && flat && mask)
// get values for when (flat2 && flat && mask)
q5 = _mm_or_si128(q5, flat2_q5);
// full list of q5 values
- _mm_store_si128((__m128i *)(s - 6 * p), p5);
- _mm_store_si128((__m128i *)(s + 5 * p), q5);
+ _mm_store_si128((__m128i *)(s - 6 * pitch), p5);
+ _mm_store_si128((__m128i *)(s + 5 * pitch), q5);
p4 = _mm_andnot_si128(flat2, p4);
// p4 remains unchanged if !(flat2 && flat && mask)
flat2_q4 = _mm_and_si128(flat2, flat2_q4);
// get values for when (flat2 && flat && mask)
q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
- _mm_store_si128((__m128i *)(s - 5 * p), p4);
- _mm_store_si128((__m128i *)(s + 4 * p), q4);
+ _mm_store_si128((__m128i *)(s - 5 * pitch), p4);
+ _mm_store_si128((__m128i *)(s + 4 * pitch), q4);
p3 = _mm_andnot_si128(flat2, p3);
// p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
flat2_q3 = _mm_and_si128(flat2, flat2_q3);
// get values for when (flat2 && flat && mask)
q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
- _mm_store_si128((__m128i *)(s - 4 * p), p3);
- _mm_store_si128((__m128i *)(s + 3 * p), q3);
+ _mm_store_si128((__m128i *)(s - 4 * pitch), p3);
+ _mm_store_si128((__m128i *)(s + 3 * pitch), q3);
p2 = _mm_andnot_si128(flat2, p2);
// p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
flat2_q2 = _mm_and_si128(flat2, flat2_q2);
// get values for when (flat2 && flat && mask)
q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
- _mm_store_si128((__m128i *)(s - 3 * p), p2);
- _mm_store_si128((__m128i *)(s + 2 * p), q2);
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
p1 = _mm_andnot_si128(flat2, p1);
// p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
flat2_q1 = _mm_and_si128(flat2, flat2_q1);
// get values for when (flat2 && flat && mask)
q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
- _mm_store_si128((__m128i *)(s - 2 * p), p1);
- _mm_store_si128((__m128i *)(s + 1 * p), q1);
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
p0 = _mm_andnot_si128(flat2, p0);
// p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
flat2_q0 = _mm_and_si128(flat2, flat2_q0);
// get values for when (flat2 && flat && mask)
q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
- _mm_store_si128((__m128i *)(s - 1 * p), p0);
- _mm_store_si128((__m128i *)(s - 0 * p), q0);
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s - 0 * pitch), q0);
}
-void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
- vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
+void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd);
+ vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd);
}
-void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
- __m128i blimit, limit, thresh;
+ __m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
- __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
- __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
- __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
- __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
- __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
- __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
- __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
+ __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
+ __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch));
+ __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch));
+ __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch));
+ __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch));
+ __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch));
+ __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch));
const __m128i one = _mm_set1_epi16(1);
const __m128i ffff = _mm_cmpeq_epi16(one, one);
__m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
__m128i filter1, filter2;
if (bd == 8) {
- blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
- limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
- thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
t80 = _mm_set1_epi16(0x80);
} else if (bd == 10) {
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
t80 = _mm_set1_epi16(0x200);
} else { // bd == 12
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
t80 = _mm_set1_epi16(0x800);
}
abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_subs_epu16(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
mask = _mm_max_epi16(abs_p1p0, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
mask = _mm_max_epi16(abs_q1q0, mask);
_mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
_mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
- mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_subs_epu16(mask, limit_v);
mask = _mm_cmpeq_epi16(mask, zero);
// flat_mask4
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
q2 = _mm_load_si128((__m128i *)flat_oq2);
work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
p2 = _mm_load_si128((__m128i *)flat_op2);
work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_store_si128((__m128i *)(s - 3 * p), p2);
- _mm_store_si128((__m128i *)(s - 2 * p), p1);
- _mm_store_si128((__m128i *)(s - 1 * p), p0);
- _mm_store_si128((__m128i *)(s + 0 * p), q0);
- _mm_store_si128((__m128i *)(s + 1 * p), q1);
- _mm_store_si128((__m128i *)(s + 2 * p), q2);
+ _mm_store_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_store_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_store_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_store_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_store_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_store_si128((__m128i *)(s + 2 * pitch), q2);
}
void vpx_highbd_lpf_horizontal_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
- vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
}
-void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
+void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
const __m128i zero = _mm_set1_epi16(0);
- __m128i blimit, limit, thresh;
+ __m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
- __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
const __m128i abs_p1p0 =
_mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
const __m128i abs_q1q0 =
__m128i filter1, filter2;
if (bd == 8) {
- blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
- limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
- thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero);
+ limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero);
+ thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero);
t80 = _mm_set1_epi16(0x80);
tff80 = _mm_set1_epi16(0xff80);
tffe0 = _mm_set1_epi16(0xffe0);
t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
} else if (bd == 10) {
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2);
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
} else { // bd == 12
- blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
- limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
- thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ blimit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4);
+ limit_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4);
+ thresh_v = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4);
t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
}
- ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
- ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
- qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
- qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
// filter_mask and hev_mask
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu16(flat, thresh);
+ hev = _mm_subs_epu16(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
// So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one));
mask = _mm_max_epi16(flat, mask);
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
_mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
_mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
mask = _mm_max_epi16(work, mask);
- mask = _mm_subs_epu16(mask, limit);
+ mask = _mm_subs_epu16(mask, limit_v);
mask = _mm_cmpeq_epi16(mask, zero);
// filter4
p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
t80);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
}
void vpx_highbd_lpf_horizontal_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
- vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd);
+ vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd);
}
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
highbd_transpose(src1, in_p, dest1, out_p, 1);
}
-void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
+void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
src[0] = s - 4;
dst[0] = t_dst;
- highbd_transpose(src, p, dst, 8, 1);
+ highbd_transpose(src, pitch, dst, 8, 1);
// Loop filtering
vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
dst[0] = s - 4;
// Transpose back
- highbd_transpose(src, 8, dst, p, 1);
+ highbd_transpose(src, 8, dst, pitch, 1);
}
void vpx_highbd_lpf_vertical_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *dst[2];
// Transpose 8x16
- highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
src[0] = t_dst;
src[1] = t_dst + 8;
dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ dst[1] = s - 4 + pitch * 8;
// Transpose back
- highbd_transpose(src, 16, dst, p, 2);
+ highbd_transpose(src, 16, dst, pitch, 2);
}
-void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
+void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
src[0] = s - 4;
dst[0] = t_dst;
- highbd_transpose(src, p, dst, 8, 1);
+ highbd_transpose(src, pitch, dst, 8, 1);
// Loop filtering
vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
dst[0] = s - 4;
// Transpose back
- highbd_transpose(src, 8, dst, p, 1);
+ highbd_transpose(src, 8, dst, pitch, 1);
}
void vpx_highbd_lpf_vertical_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *dst[2];
// Transpose 8x16
- highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
src[1] = t_dst + 8;
dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ dst[1] = s - 4 + pitch * 8;
// Transpose back
- highbd_transpose(src, 16, dst, p, 2);
+ highbd_transpose(src, 16, dst, pitch, 2);
}
-void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
+void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
dst[1] = t_dst + 8 * 8;
// Transpose 16x8
- highbd_transpose(src, p, dst, 8, 2);
+ highbd_transpose(src, pitch, dst, 8, 2);
// Loop filtering
vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
dst[1] = s;
// Transpose back
- highbd_transpose(src, 8, dst, p, 2);
+ highbd_transpose(src, 8, dst, pitch, 2);
}
-void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
+void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
// Transpose 16x16
- highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
- highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+ highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
// Loop filtering
vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
thresh, bd);
// Transpose back
- highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
- highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+ highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch,
+ pitch);
}
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
-void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
- q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
p0q0 = _mm_shuffle_epi32(q0p0, 78);
{
abs_p1q1 =
_mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_subs_epu8(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(abs_p1p0, mask);
_mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, limit_v);
mask = _mm_cmpeq_epi8(mask, zero);
}
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
- q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
flat2 = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
_mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));
- q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
q6p6 = _mm_andnot_si128(flat2, q6p6);
flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
q5p5 = _mm_andnot_si128(flat2, q5p5);
flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
q4p4 = _mm_andnot_si128(flat2, q4p4);
flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
q3p3 = _mm_andnot_si128(flat2, q3p3);
flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
q2p2 = _mm_andnot_si128(flat2, q2p2);
flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
q1p1 = _mm_andnot_si128(flat2, q1p1);
flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
q0p0 = _mm_andnot_si128(flat2, q0p0);
flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
}
}
8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};
-void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
__m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
- const __m128i thresh =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
- const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
- const __m128i blimit =
- _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));
-
- p256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
- p256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
- p256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
- p256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
- p256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
- q256_0 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
- q256_1 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
- q256_2 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
- q256_3 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
- q256_4 =
- _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0]));
+ const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0]));
+
+ p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch)));
+ p256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch)));
+ p256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch)));
+ p256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch)));
+ p256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch)));
+ q256_0 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch)));
+ q256_1 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch)));
+ q256_2 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch)));
+ q256_3 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch)));
+ q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch)));
p4 = _mm256_castsi256_si128(p256_4);
p3 = _mm256_castsi256_si128(p256_3);
_mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
__m128i work;
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_subs_epu8(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(flat, mask);
_mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
_mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, limit_v);
mask = _mm_cmpeq_epi8(mask, zero);
}
flat = _mm_and_si128(flat, mask);
p256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch)));
q256_5 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch)));
p5 = _mm256_castsi256_si128(p256_5);
q5 = _mm256_castsi256_si128(q256_5);
flat2 = _mm_max_epu8(
flat2 = _mm_max_epu8(work, flat2);
p256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch)));
q256_6 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch)));
p6 = _mm256_castsi256_si128(p256_6);
q6 = _mm256_castsi256_si128(q256_6);
work = _mm_max_epu8(
flat2 = _mm_max_epu8(work, flat2);
p256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch)));
q256_7 = _mm256_castpd_si256(
- _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
+ _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch)));
p7 = _mm256_castsi256_si128(p256_7);
q7 = _mm256_castsi256_si128(q256_7);
work = _mm_max_epu8(
p6 = _mm_andnot_si128(flat2, p6);
flat2_p6 = _mm_and_si128(flat2, flat2_p6);
p6 = _mm_or_si128(flat2_p6, p6);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
p5 = _mm_andnot_si128(flat2, p5);
flat2_p5 = _mm_and_si128(flat2, flat2_p5);
p5 = _mm_or_si128(flat2_p5, p5);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
p4 = _mm_andnot_si128(flat2, p4);
flat2_p4 = _mm_and_si128(flat2, flat2_p4);
p4 = _mm_or_si128(flat2_p4, p4);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
p3 = _mm_andnot_si128(flat2, p3);
flat2_p3 = _mm_and_si128(flat2, flat2_p3);
p3 = _mm_or_si128(flat2_p3, p3);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
p2 = _mm_andnot_si128(flat2, p2);
flat2_p2 = _mm_and_si128(flat2, flat2_p2);
p2 = _mm_or_si128(flat2_p2, p2);
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
p1 = _mm_andnot_si128(flat2, p1);
flat2_p1 = _mm_and_si128(flat2, flat2_p1);
p1 = _mm_or_si128(flat2_p1, p1);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
p0 = _mm_andnot_si128(flat2, p0);
flat2_p0 = _mm_and_si128(flat2, flat2_p0);
p0 = _mm_or_si128(flat2_p0, p0);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
q0 = _mm_andnot_si128(flat2, q0);
flat2_q0 = _mm_and_si128(flat2, flat2_q0);
q0 = _mm_or_si128(flat2_q0, q0);
- _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0);
q1 = _mm_andnot_si128(flat2, q1);
flat2_q1 = _mm_and_si128(flat2, flat2_q1);
q1 = _mm_or_si128(flat2_q1, q1);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
q2 = _mm_andnot_si128(flat2, q2);
flat2_q2 = _mm_and_si128(flat2, flat2_q2);
q2 = _mm_or_si128(flat2_q2, q2);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
q3 = _mm_andnot_si128(flat2, q3);
flat2_q3 = _mm_and_si128(flat2, flat2_q3);
q3 = _mm_or_si128(flat2_q3, q3);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
q4 = _mm_andnot_si128(flat2, q4);
flat2_q4 = _mm_and_si128(flat2, flat2_q4);
q4 = _mm_or_si128(flat2_q4, q4);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
q5 = _mm_andnot_si128(flat2, q5);
flat2_q5 = _mm_and_si128(flat2, flat2_q5);
q5 = _mm_or_si128(flat2_q5, q5);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
q6 = _mm_andnot_si128(flat2, q6);
flat2_q6 = _mm_and_si128(flat2, flat2_q6);
q6 = _mm_or_si128(flat2_q6, q6);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
}
}
/* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
hev = \
_mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
- hev = _mm_cmpgt_epi16(hev, thresh); \
+ hev = _mm_cmpgt_epi16(hev, thresh_v); \
hev = _mm_packs_epi16(hev, hev); \
\
/* const int8_t mask = filter_mask(*limit, *blimit, */ \
flat = _mm_max_epu8(work, flat); \
flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
mask = _mm_unpacklo_epi64(mask, flat); \
- mask = _mm_subs_epu8(mask, limit); \
+ mask = _mm_subs_epu8(mask, limit_v); \
mask = _mm_cmpeq_epi8(mask, zero); \
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
} while (0)
ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
} while (0)
-void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
const __m128i zero = _mm_set1_epi16(0);
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
- p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s - 4 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s + 0 * p)));
- q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 4 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 0 * pitch)));
+ q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
FILTER_HEV_MASK;
FILTER4;
- _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
- _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0
- _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1
+ _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1
}
-void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
const __m128i zero = _mm_set1_epi16(0);
- const __m128i limit =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- const __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+ const __m128i limit_v =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
+ _mm_loadl_epi64((const __m128i *)limit));
+ const __m128i thresh_v =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i x0, x1, x2, x3;
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+ q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4)));
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+ x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4)));
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+ x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4)));
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
- _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+ x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)),
+ _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4)));
// Transpose 8x8
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
- storeu_uint32(s + 0 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 1 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 2 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
ps1ps0 = _mm_srli_si128(ps1ps0, 4);
- storeu_uint32(s + 3 * p - 2, _mm_cvtsi128_si32(ps1ps0));
+ storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
- storeu_uint32(s + 4 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 5 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 6 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
qs1qs0 = _mm_srli_si128(qs1qs0, 4);
- storeu_uint32(s + 7 * p - 2, _mm_cvtsi128_si32(qs1qs0));
+ storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
}
-void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
__m128i mask, hev, flat, flat2;
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
- q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
+ q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch));
q4p4 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
- q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch)));
+ q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch));
q3p3 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
- q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch)));
+ q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch));
q2p2 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
- q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch)));
+ q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch));
q1p1 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch)));
p1q1 = _mm_shuffle_epi32(q1p1, 78);
- q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch));
q0p0 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch)));
p0q0 = _mm_shuffle_epi32(q0p0, 78);
{
abs_p0q0 = abs_diff(q0p0, p0q0);
abs_p1q1 = abs_diff(q1p1, p1q1);
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_subs_epu8(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(abs_p1p0, mask);
work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, limit_v);
mask = _mm_cmpeq_epi8(mask, zero);
}
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
+ q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch));
q5p5 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch)));
- q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
+ q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch));
q6p6 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch)));
flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
- q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
+ q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch));
q7p7 = _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch)));
work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
flat2 = _mm_max_epu8(work, flat2);
flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
q6p6 = _mm_andnot_si128(flat2, q6p6);
flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
- _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
- _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
+ _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6);
+ _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6));
q5p5 = _mm_andnot_si128(flat2, q5p5);
flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
+ _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5);
+ _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5));
q4p4 = _mm_andnot_si128(flat2, q4p4);
flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
+ _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4);
+ _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4));
q3p3 = _mm_andnot_si128(flat2, q3p3);
flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
+ _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3);
+ _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3));
q2p2 = _mm_andnot_si128(flat2, q2p2);
flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2);
+ _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2));
q1p1 = _mm_andnot_si128(flat2, q1p1);
flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1);
+ _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1));
q0p0 = _mm_andnot_si128(flat2, q0p0);
flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0);
+ _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0));
}
}
return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
-void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
__m128i mask, hev, flat, flat2;
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i max_abs_p1p0q1q0;
- p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
- p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
- p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
- p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
- q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
- q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch));
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
{
const __m128i abs_p1p0 = abs_diff(p1, p0);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
mask = _mm_max_epu8(work, mask);
work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
mask = _mm_max_epu8(work, mask);
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, limit_v);
mask = _mm_cmpeq_epi8(mask, zero);
}
oq0 = _mm_xor_si128(q0, t80);
oq1 = _mm_xor_si128(q1, t80);
- hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
+ _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6);
f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+ _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5);
f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+ _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4);
f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3);
f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2);
f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1);
f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5);
f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
- _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6);
}
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
}
-void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
+void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh) {
DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- const __m128i limit = _mm_load_si128((const __m128i *)_limit);
- const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
__m128i mask, hev, flat;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
__m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
- q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * pitch)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * pitch)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * pitch)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * pitch)));
p1q1 = _mm_shuffle_epi32(q1p1, 78);
p0q0 = _mm_shuffle_epi32(q0p0, 78);
abs_p0q0 = abs_diff(q0p0, p0q0);
abs_p1q1 = abs_diff(q1p1, p1q1);
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_subs_epu8(flat, thresh_v);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(abs_p1p0, mask);
work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
mask = _mm_max_epu8(work, mask);
mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_subs_epu8(mask, limit_v);
mask = _mm_cmpeq_epi8(mask, zero);
// flat_mask4
unsigned char *src = s;
{
__m128i workp_a, workp_b, workp_shft;
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
const __m128i t80 = _mm_set1_epi8(0x80);
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i ps1 =
- _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), t80);
const __m128i ps0 =
- _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80);
const __m128i qs0 =
- _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80);
const __m128i qs1 =
- _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
+ _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
p2 = _mm_loadl_epi64((__m128i *)flat_op2);
work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+ _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2);
}
}
-void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
+void vpx_lpf_horizontal_8_dual_sse2(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
__m128i mask, hev, flat;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
{
const __m128i abs_p1p0 =
_mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
do {
__m128i workp_a, workp_b, workp_shft;
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)),
+ zero);
workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
const __m128i t7f = _mm_set1_epi8(0x7f);
const __m128i ps1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
const __m128i ps0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
const __m128i qs0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
const __m128i qs1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
q2 = _mm_load_si128((__m128i *)flat_oq2);
work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
p2 = _mm_load_si128((__m128i *)flat_op2);
work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2);
}
}
-void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit0,
- const unsigned char *_limit0,
- const unsigned char *_thresh0,
- const unsigned char *_blimit1,
- const unsigned char *_limit1,
- const unsigned char *_thresh1) {
+void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
+ const unsigned char *blimit0,
+ const unsigned char *limit0,
+ const unsigned char *thresh0,
+ const unsigned char *blimit1,
+ const unsigned char *limit1,
+ const unsigned char *thresh1) {
const __m128i blimit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
+ _mm_load_si128((const __m128i *)blimit1));
const __m128i limit =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0),
+ _mm_load_si128((const __m128i *)limit1));
const __m128i thresh =
- _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
+ _mm_load_si128((const __m128i *)thresh1));
const __m128i zero = _mm_set1_epi16(0);
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
__m128i mask, hev, flat;
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
// filter_mask and hev_mask
{
const __m128i t7f = _mm_set1_epi8(0x7f);
const __m128i ps1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80);
const __m128i ps0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80);
const __m128i qs0 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80);
const __m128i qs1 =
- _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+ _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1);
}
}
} while (++idx8x8 < num_8x8_to_transpose);
}
-void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
unsigned char *dst[2];
// Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
src[0] = t_dst;
src[1] = t_dst + 8;
dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ dst[1] = s - 4 + pitch * 8;
// Transpose back
- transpose(src, 16, dst, p, 2);
+ transpose(src, 16, dst, pitch, 2);
}
-void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
src[0] = s - 4;
dst[0] = t_dst;
- transpose(src, p, dst, 8, 1);
+ transpose(src, pitch, dst, 8, 1);
// Loop filtering
vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
dst[0] = s - 4;
// Transpose back
- transpose(src, 8, dst, p, 1);
+ transpose(src, 8, dst, pitch, 1);
}
-void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
+void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0,
const uint8_t *limit0, const uint8_t *thresh0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
unsigned char *dst[2];
// Transpose 8x16
- transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+ transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
// Loop filtering
vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
src[1] = t_dst + 8;
dst[0] = s - 4;
- dst[1] = s - 4 + p * 8;
+ dst[1] = s - 4 + pitch * 8;
// Transpose back
- transpose(src, 16, dst, p, 2);
+ transpose(src, 16, dst, pitch, 2);
}
-void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
dst[1] = t_dst + 8 * 8;
// Transpose 16x8
- transpose(src, p, dst, 8, 2);
+ transpose(src, pitch, dst, 8, 2);
// Loop filtering
vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
dst[1] = s;
// Transpose back
- transpose(src, 8, dst, p, 2);
+ transpose(src, 8, dst, pitch, 2);
}
-void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
+void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
// Transpose 16x16
- transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
- transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+ transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
// Loop filtering
vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
// Transpose back
- transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
- transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
}
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan_ptr,
- const int16_t *iscan_ptr) {
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
const __m256i big_zero = _mm256_setzero_si256();
int index;
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan_ptr;
+ (void)scan;
(void)skip_block;
assert(!skip_block);
store_tran_low(coeff0, dqcoeff_ptr);
store_tran_low(coeff1, dqcoeff_ptr + 8);
- eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
- zero);
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
}
// AC only loop.
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
eob = _mm_max_epi16(eob, eob0);
}
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_avx(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m256i big_zero = _mm256_setzero_si256();
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan_ptr;
+ (void)scan;
(void)n_coeffs;
(void)skip_block;
assert(!skip_block);
store_tran_low(coeff0, dqcoeff_ptr);
store_tran_low(coeff1, dqcoeff_ptr + 8);
- eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
- zero);
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
}
// AC only loop.
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
eob = _mm_max_epi16(eob, eob0);
}
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan_ptr,
- const int16_t *iscan_ptr) {
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
int index = 16;
__m128i cmp_mask0, cmp_mask1;
__m128i eob, eob0;
- (void)scan_ptr;
+ (void)scan;
(void)skip_block;
assert(!skip_block);
store_tran_low(coeff0, dqcoeff_ptr);
store_tran_low(coeff1, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+ const int16_t *scan, const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
int index = 16;
__m128i cmp_mask0, cmp_mask1;
__m128i eob, eob0;
- (void)scan_ptr;
+ (void)scan;
(void)skip_block;
assert(!skip_block);
store_tran_low(coeff0, dqcoeff_ptr);
store_tran_low(coeff1, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
*eob_ptr = accumulate_eob(eob);
}
-void vpx_quantize_b_32x32_ssse3(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
int index;
__m128i all_zero;
__m128i eob = zero, eob0;
- (void)scan_ptr;
+ (void)scan;
(void)n_coeffs;
(void)skip_block;
assert(!skip_block);
store_tran_low(coeff0, dqcoeff_ptr);
store_tran_low(coeff1, dqcoeff_ptr + 8);
- eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
- zero);
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
}
// AC only loop.
store_tran_low(coeff0, dqcoeff_ptr + index);
store_tran_low(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
eob = _mm_max_epi16(eob, eob0);
}
return _mm_mullo_epi16(qcoeff, dequant);
}
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
+// zbin to add 1 to the index in 'scan'.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
const __m128i zbin_mask0,
const __m128i zbin_mask1,
- const int16_t *scan_ptr, const int index,
+ const int16_t *scan, const int index,
const __m128i zero) {
const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
- __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
- __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
__m128i eob0, eob1;
// Add one to convert from indices to counts
scan0 = _mm_sub_epi16(scan0, zbin_mask0);
#include "vpx/vpx_integer.h"
static INLINE void calc_final(const __m256i *const sums /*[4]*/,
- uint32_t res[4]) {
+ uint32_t sad_array[4]) {
const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
const __m256i t2 = _mm256_hadd_epi32(t0, t1);
const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
_mm256_extractf128_si256(t2, 1));
- _mm_storeu_si128((__m128i *)res, sum);
+ _mm_storeu_si128((__m128i *)sad_array, sum);
}
-void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
+void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
int i;
const uint8_t *refs[4];
__m256i sums[4];
- refs[0] = ref[0];
- refs[1] = ref[1];
- refs[2] = ref[2];
- refs[3] = ref[3];
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
sums[0] = _mm256_setzero_si256();
sums[1] = _mm256_setzero_si256();
sums[2] = _mm256_setzero_si256();
for (i = 0; i < 32; i++) {
__m256i r[4];
- // load src and all refs
- const __m256i s = _mm256_load_si256((const __m256i *)src);
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
- // sum of the absolute differences between every ref-i to src
+ // sum of the absolute differences between every ref[] to src
r[0] = _mm256_sad_epu8(r[0], s);
r[1] = _mm256_sad_epu8(r[1], s);
r[2] = _mm256_sad_epu8(r[2], s);
r[3] = _mm256_sad_epu8(r[3], s);
- // sum every ref-i
+ // sum every ref[]
sums[0] = _mm256_add_epi32(sums[0], r[0]);
sums[1] = _mm256_add_epi32(sums[1], r[1]);
sums[2] = _mm256_add_epi32(sums[2], r[2]);
sums[3] = _mm256_add_epi32(sums[3], r[3]);
- src += src_stride;
+ src_ptr += src_stride;
refs[0] += ref_stride;
refs[1] += ref_stride;
refs[2] += ref_stride;
refs[3] += ref_stride;
}
- calc_final(sums, res);
+ calc_final(sums, sad_array);
}
-void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
+void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
__m256i sums[4];
int i;
const uint8_t *refs[4];
- refs[0] = ref[0];
- refs[1] = ref[1];
- refs[2] = ref[2];
- refs[3] = ref[3];
+ refs[0] = ref_array[0];
+ refs[1] = ref_array[1];
+ refs[2] = ref_array[2];
+ refs[3] = ref_array[3];
sums[0] = _mm256_setzero_si256();
sums[1] = _mm256_setzero_si256();
sums[2] = _mm256_setzero_si256();
for (i = 0; i < 64; i++) {
__m256i r_lo[4], r_hi[4];
- // load 64 bytes from src and all refs
- const __m256i s_lo = _mm256_load_si256((const __m256i *)src);
- const __m256i s_hi = _mm256_load_si256((const __m256i *)(src + 32));
+ // load 64 bytes from src and all ref[]
+ const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr);
+ const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32));
r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32));
r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32));
- // sum of the absolute differences between every ref-i to src
+ // sum of the absolute differences between every ref[] to src
r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo);
r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo);
r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo);
r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi);
r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi);
- // sum every ref-i
+ // sum every ref[]
sums[0] = _mm256_add_epi32(sums[0], r_lo[0]);
sums[1] = _mm256_add_epi32(sums[1], r_lo[1]);
sums[2] = _mm256_add_epi32(sums[2], r_lo[2]);
sums[2] = _mm256_add_epi32(sums[2], r_hi[2]);
sums[3] = _mm256_add_epi32(sums[3], r_hi[3]);
- src += src_stride;
+ src_ptr += src_stride;
refs[0] += ref_stride;
refs[1] += ref_stride;
refs[2] += ref_stride;
refs[3] += ref_stride;
}
- calc_final(sums, res);
+ calc_final(sums, sad_array);
}
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
+void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride,
uint32_t res[4]) {
__m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
int i;
const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
+ ref0 = ref_array[0];
+ ref1 = ref_array[1];
+ ref2 = ref_array[2];
+ ref3 = ref_array[3];
sum_ref0 = _mm512_set1_epi16(0);
sum_ref1 = _mm512_set1_epi16(0);
sum_ref2 = _mm512_set1_epi16(0);
sum_ref3 = _mm512_set1_epi16(0);
for (i = 0; i < 64; i++) {
- // load src and all refs
- src_reg = _mm512_loadu_si512((const __m512i *)src);
+ // load src and all ref[]
+ src_reg = _mm512_loadu_si512((const __m512i *)src_ptr);
ref0_reg = _mm512_loadu_si512((const __m512i *)ref0);
ref1_reg = _mm512_loadu_si512((const __m512i *)ref1);
ref2_reg = _mm512_loadu_si512((const __m512i *)ref2);
ref3_reg = _mm512_loadu_si512((const __m512i *)ref3);
- // sum of the absolute differences between every ref-i to src
+ // sum of the absolute differences between every ref[] to src
ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg);
ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg);
ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg);
ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg);
- // sum every ref-i
+ // sum every ref[]
sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg);
sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg);
sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg);
sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg);
- src += src_stride;
+ src_ptr += src_stride;
ref0 += ref_stride;
ref1 += ref_stride;
ref2 += ref_stride;
{
__m256i sum256;
__m128i sum128;
- // in sum_ref-i the result is saved in the first 4 bytes
+ // in sum_ref[] the result is saved in the first 4 bytes
// the other 4 bytes are zeroed.
// sum_ref1 and sum_ref3 are shifted left by 4 bytes
sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4);
sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1);
sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3);
- // merge every 64 bit from each sum_ref-i
+ // merge every 64 bit from each sum_ref[]
sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2);
sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2);
;Compute max and min values of a pixel
mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
+ movsxd rcx, DWORD PTR arg(6) ;bd
movq xmm0, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
;Compute max and min values of a pixel
mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
+ movsxd rcx, DWORD PTR arg(6) ;bd
movq xmm0, rdx
movq xmm1, rcx
pshufd xmm0, xmm0, 0b
pshufd xmm3, xmm3, 0
mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
+ movsxd rcx, DWORD PTR arg(6) ;bd
movq xmm5, rdx
movq xmm2, rcx
pshufd xmm5, xmm5, 0b
pshufd xmm4, xmm4, 0
mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
+ movsxd rcx, DWORD PTR arg(6) ;bd
movq xmm8, rdx
movq xmm5, rcx
pshufd xmm8, xmm8, 0b