add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
+ add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp sse2/;
+
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
*out_dist_sum += dist << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var_y, sse_y;
+ (void)plane;
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+}
+#else
static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
int *skippable, int64_t *sse, int plane,
BLOCK_SIZE bsize, TX_SIZE tx_size) {
const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-#if CONFIG_VP9_HIGHBITDEPTH
- unsigned int var_y, sse_y;
- model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
- *sse = INT_MAX;
- *skippable = 0;
- return;
-#else
(void)cpi;
-#endif
-
vp9_subtract_plane(x, bsize, plane);
-
*skippable = 1;
- *rate = 0;
- *dist = 0;
- *sse = 0;
-
// Keep track of the row and column of the blocks we use so that we know
// if we are in the unrestricted motion border.
for (r = 0; r < max_blocks_high; r += block_step) {
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
int i, j;
const int16_t *src_diff;
- int64_t this_sse;
txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
assert(0);
break;
}
+ *skippable &= (*eob == 0);
+ }
+ block += step;
+ }
+ }
+
+ if (*skippable && *sse < INT64_MAX) {
+ *dist = (*sse << 6) >> shift;
+ *sse = *dist;
+ return;
+ }
- *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+ block = 0;
+ *rate = 0;
+ *dist = 0;
+ *sse = (*sse << 6) >> shift;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
if (*eob == 1)
*rate += (int)abs(qcoeff[0]);
else if (*eob > 1)
*rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
- *sse += (this_sse >> shift);
- *skippable &= (*eob == 0);
+ *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
}
block += step;
}
*rate <<= 8;
*rate *= 6;
}
+#endif
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int i, j;
int rate;
int64_t dist;
- int64_t this_sse;
+ int64_t this_sse = INT64_MAX;
int is_skippable;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
}
if (bsize <= BLOCK_16X16) {
+ this_sse = (int64_t)sse_y;
block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
&this_sse, 0, bsize, mbmi->tx_size);
x->skip_txfm[0] = is_skippable;
movd edx, m5
%endif
RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET