/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/config.h"
/* Project headers assumed for the quantize, inverse transform, recon and
 * FDCT entry points referenced below. */
#include "encodemb.h"
#include "reconinter.h"
#include "quantize.h"
#include "invtrans.h"
#include "recon.h"
#include "reconintra.h"
#include "dct.h"
#include "vpx_mem/vpx_mem.h"

#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
#define IF_RTCD(x) NULL
#endif

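/* Note on the convention assumed for IF_RTCD(): with runtime CPU detection
 * enabled it forwards the RTCD table pointer to the common idct/recon entry
 * points used at the bottom of this file; when detection is disabled those
 * entry points resolve at compile time and the NULL argument is unused.
 */
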
void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *src_ptr = (*(be->base_src) + be->src);
    short *diff_ptr = be->src_diff;
    unsigned char *pred_ptr = bd->predictor;
    int src_stride = be->src_stride;

    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
        }

        diff_ptr += pitch;
        pred_ptr += pitch;
        src_ptr  += src_stride;
    }
}

void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
{
    short *udiff = diff + 256;
    short *vdiff = diff + 320;
    unsigned char *upred = pred + 256;
    unsigned char *vpred = pred + 320;

    int r, c;

    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            udiff[c] = usrc[c] - upred[c];
        }

        udiff += 8;
        upred += 8;
        usrc  += stride;
    }

    for (r = 0; r < 8; r++)
    {
        for (c = 0; c < 8; c++)
        {
            vdiff[c] = vsrc[c] - vpred[c];
        }

        vdiff += 8;
        vpred += 8;
        vsrc  += stride;
    }
}

void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
{
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            diff[c] = src[c] - pred[c];
        }

        diff += 16;
        pred += 16;
        src  += stride;
    }
}

static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
}

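/* Layout note: the macroblock residual (src_diff) and predictor buffers are
 * stored plane after plane -- 256 luma values first, 64 U values at offset
 * 256, 64 V values at offset 320, and (in the residual buffer) 16 second
 * order DC values at offset 384.  These are the offsets used by
 * vp8_subtract_mbuv_c() above and vp8_build_dcblock() below.
 */
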
void vp8_build_dcblock(MACROBLOCK *x)
{
    short *src_diff_ptr = &x->src_diff[384];
    int i;

    for (i = 0; i < 16; i++)
    {
        src_diff_ptr[i] = x->coeff[i * 16];
    }
}

void vp8_transform_mbuv(MACROBLOCK *x)
{
    int i;

    for (i = 16; i < 24; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }
}

void vp8_transform_intra_mby(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    vp8_build_dcblock(x);

    // do 2nd order transform on the dc block
    x->short_walsh4x4(&x->block[24].src_diff[0],
        &x->block[24].coeff[0], 8);
}

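/* The transform helpers above and below walk the 4x4 blocks two at a time
 * (i += 2) so that vp8_short_fdct8x4 can transform a horizontal pair of
 * blocks per call.  The final pitch argument appears to be the residual row
 * stride in bytes: 32 for luma rows (16 shorts), 16 for chroma rows
 * (8 shorts), and 8 for the 4-short rows of the second order block handled
 * by the Walsh transform.
 */
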
void vp8_transform_mb(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
        vp8_build_dcblock(x);

    for (i = 16; i < 24; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 16);
    }

    // do 2nd order transform on the dc block
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
        x->short_walsh4x4(&x->block[24].src_diff[0],
            &x->block[24].coeff[0], 8);
}

void vp8_transform_mby(MACROBLOCK *x)
{
    int i;

    for (i = 0; i < 16; i += 2)
    {
        x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
            &x->block[i].coeff[0], 32);
    }

    // build dc block from 16 y dc values
    if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)
    {
        vp8_build_dcblock(x);

        // do 2nd order transform on the dc block
        x->short_walsh4x4(&x->block[24].src_diff[0],
            &x->block[24].coeff[0], 8);
    }
}

void vp8_stuff_inter16x16(MACROBLOCK *x)
{
    vp8_build_inter_predictors_mb_s(&x->e_mbd);
    {
        // recon = copy from predictors to destination
        BLOCKD *b = &x->e_mbd.block[0];
        unsigned char *pred_ptr = b->predictor;
        unsigned char *dst_ptr = *(b->base_dst) + b->dst;
        int stride = b->dst_stride;
        int i;

        // copy the 16x16 luma predictor row by row
        for (i = 0; i < 16; i++)
            vpx_memcpy(dst_ptr + i * stride, pred_ptr + 16 * i, 16);

        // copy the 8x8 U predictor
        b = &x->e_mbd.block[16];
        pred_ptr = b->predictor;
        dst_ptr = *(b->base_dst) + b->dst;
        stride = b->dst_stride;

        for (i = 0; i < 8; i++)
            vpx_memcpy(dst_ptr + i * stride, pred_ptr + 8 * i, 8);

        // copy the 8x8 V predictor
        b = &x->e_mbd.block[20];
        pred_ptr = b->predictor;
        dst_ptr = *(b->base_dst) + b->dst;
        stride = b->dst_stride;

        for (i = 0; i < 8; i++)
            vpx_memcpy(dst_ptr + i * stride, pred_ptr + 8 * i, 8);
    }
}

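/* vp8_stuff_inter16x16() produces a reconstruction with no residual at all:
 * the inter predictors are built directly and then copied row by row into
 * the destination frame (16x16 luma plus two 8x8 chroma planes), which is
 * what the row-sized vpx_memcpy calls above implement.
 */
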
#if !(CONFIG_REALTIME_ONLY)
#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
#define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

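/* Worked example (illustrative numbers, not taken from the encoder): with
 * RM = 256, DM = 1, R = 100 and D = 500,
 *   RDCOST  = ((128 + 100*256) >> 8) + 1*500 = 100 + 500 = 600
 *   RDTRUNC = (128 + 100*256) & 0xFF          = 128
 * i.e. RDCOST is the rate scaled by RM/256 (rounded) plus weighted
 * distortion, and RDTRUNC exposes the bits discarded by the >> 8 so exact
 * RDCOST ties can be broken consistently in vp8_optimize_b() below.
 */
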
typedef struct vp8_token_state vp8_token_state;

struct vp8_token_state
{
    /* Field widths below are inferred from how the trellis in
     * vp8_optimize_b() uses them. */
    int         rate;
    int         error;
    signed char next;
    signed char token;
    short       qc;
};

// TODO: experiments to find optimal multiple numbers
static const int plane_rd_mult[4] =
{
    /* Per-plane error multipliers indexed by coefficient plane type.  The
     * original tuning constants are not preserved in this excerpt, so
     * neutral unit multipliers stand in for them here. */
    1, 1, 1, 1
};

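/* vp8_optimize_b() below runs a small Viterbi-style trellis over the
 * quantized coefficients of one block, in reverse zig-zag order.  For each
 * coefficient position i it keeps two candidate states: tokens[i][0] keeps
 * the quantizer's original output, tokens[i][1] the same coefficient rounded
 * one step toward zero.  Each state stores the accumulated rate, squared
 * error, the index of the next nonzero position and the token that will be
 * coded, and best_mask[] records which successor was chosen so the winning
 * path can be replayed forward at the end.
 */
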
void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
                    ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                    const VP8_ENCODER_RTCD *rtcd)
{
    BLOCK *b;
    BLOCKD *d;
    vp8_token_state tokens[17][2];
    unsigned best_mask[2];
    const short *dequant_ptr;
    const short *coeff_ptr;
    short *qcoeff_ptr;
    short *dqcoeff_ptr;
    int eob;
    int i0;
    int rc;
    int x;
    int sz;
    int next;
    int rdmult;
    int rddiv;
    int final_eob;
    int rd_cost0, rd_cost1;
    int rate0, rate1;
    int error0, error1;
    int t0, t1;
    int best;
    int band;
    int pt;
    int i;
    int base_bits, d2, dx;
    int err_mult = plane_rd_mult[type];

    b = &mb->block[ib];
    d = &mb->e_mbd.block[ib];

    /* Enable this to test the effect of RDO as a replacement for the dynamic
     * zero bin instead of an augmentation of it.
     */
#if 0
    vp8_strict_quantize_b(b, d);
#endif

    dequant_ptr = &d->dequant[0][0];
    coeff_ptr = &b->coeff[0];
    qcoeff_ptr = d->qcoeff;
    dqcoeff_ptr = d->dqcoeff;

    /* Blocks whose DC coefficient is coded in the second order block start
     * at position 1. */
    i0 = !type;
    eob = d->eob;

    /* Now set up a Viterbi trellis to evaluate alternative roundings. */
    /* TODO: These should vary with the block type, since the quantizer does. */
    rdmult = (mb->rdmult << 2) * err_mult;
    rddiv = mb->rddiv;
    best_mask[0] = best_mask[1] = 0;
    /* Initialize the sentinel node of the trellis. */
    tokens[eob][0].rate = 0;
    tokens[eob][0].error = 0;
    tokens[eob][0].next = 16;
    tokens[eob][0].token = DCT_EOB_TOKEN;
    tokens[eob][0].qc = 0;
    *(tokens[eob] + 1) = *(tokens[eob] + 0);
    next = eob;

    for (i = eob; i-- > i0;)
    {
        rc = vp8_default_zig_zag1d[i];
        x = qcoeff_ptr[rc];

        /* Only add a trellis state for non-zero coefficients. */
        if (x)
        {
            int shortcut = 0;

            error0 = tokens[next][0].error;
            error1 = tokens[next][1].error;
            /* Evaluate the first possibility for this state. */
            rate0 = tokens[next][0].rate;
            rate1 = tokens[next][1].rate;
            t0 = (vp8_dct_value_tokens_ptr + x)->Token;

            /* Consider both possible successor states. */
            if (next < 16)
            {
                band = vp8_coef_bands[i + 1];
                pt = vp8_prev_token_class[t0];
                rate0 +=
                    mb->token_costs[type][band][pt][tokens[next][0].token];
                rate1 +=
                    mb->token_costs[type][band][pt][tokens[next][1].token];
            }

            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);

            if (rd_cost0 == rd_cost1)
            {
                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
            }

            /* And pick the best. */
            best = rd_cost1 < rd_cost0;
            base_bits = *(vp8_dct_value_cost_ptr + x);
            dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
            d2 = dx * dx;
            tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
            tokens[i][0].error = d2 + (best ? error1 : error0);
            tokens[i][0].next = next;
            tokens[i][0].token = t0;
            tokens[i][0].qc = x;
            best_mask[0] |= best << i;

            /* Evaluate the second possibility for this state. */
            rate0 = tokens[next][0].rate;
            rate1 = tokens[next][1].rate;

            /* Only try rounding this coefficient one step toward zero when
             * the dequantized value stays within one quantizer step of the
             * original coefficient. */
            if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) &&
                (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc]))
                shortcut = 1;
            else
                shortcut = 0;

            if (shortcut)
            {
                sz = -(x < 0);
                x -= 2 * sz + 1;
            }

            /* Consider both possible successor states. */
            if (!x)
            {
                /* If we reduced this coefficient to zero, check to see if
                 * we need to move the EOB back here.
                 */
                t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
                     DCT_EOB_TOKEN : ZERO_TOKEN;
                t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
                     DCT_EOB_TOKEN : ZERO_TOKEN;
            }
            else
            {
                t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
            }

            if (next < 16)
            {
                band = vp8_coef_bands[i + 1];

                if (t0 != DCT_EOB_TOKEN)
                {
                    pt = vp8_prev_token_class[t0];
                    rate0 += mb->token_costs[type][band][pt][
                        tokens[next][0].token];
                }

                if (t1 != DCT_EOB_TOKEN)
                {
                    pt = vp8_prev_token_class[t1];
                    rate1 += mb->token_costs[type][band][pt][
                        tokens[next][1].token];
                }
            }

            rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
            rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);

            if (rd_cost0 == rd_cost1)
            {
                rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
                rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
            }

            /* And pick the best. */
            best = rd_cost1 < rd_cost0;
            base_bits = *(vp8_dct_value_cost_ptr + x);

            if (shortcut)
            {
                dx -= (dequant_ptr[rc] + sz) ^ sz;
                d2 = dx * dx;
            }

            tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
            tokens[i][1].error = d2 + (best ? error1 : error0);
            tokens[i][1].next = next;
            tokens[i][1].token = best ? t1 : t0;
            tokens[i][1].qc = x;
            best_mask[1] |= best << i;
            /* Finally, make this the new head of the trellis. */
            next = i;
        }
        /* There's no choice to make for a zero coefficient, so we don't
         * add a new trellis node, but we do need to update the costs.
         */
        else
        {
            band = vp8_coef_bands[i + 1];
            t0 = tokens[next][0].token;
            t1 = tokens[next][1].token;

            /* Update the cost of each path if we're past the EOB token. */
            if (t0 != DCT_EOB_TOKEN)
            {
                tokens[next][0].rate += mb->token_costs[type][band][0][t0];
                tokens[next][0].token = ZERO_TOKEN;
            }

            if (t1 != DCT_EOB_TOKEN)
            {
                tokens[next][1].rate += mb->token_costs[type][band][0][t1];
                tokens[next][1].token = ZERO_TOKEN;
            }
            /* Don't update next, because we didn't add a new node. */
        }
    }

    /* Now pick the best path through the whole trellis. */
    band = vp8_coef_bands[i + 1];
    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
    rate0 = tokens[next][0].rate;
    rate1 = tokens[next][1].rate;
    error0 = tokens[next][0].error;
    error1 = tokens[next][1].error;
    t0 = tokens[next][0].token;
    t1 = tokens[next][1].token;
    rate0 += mb->token_costs[type][band][pt][t0];
    rate1 += mb->token_costs[type][band][pt][t1];
    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);

    if (rd_cost0 == rd_cost1)
    {
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
    }

    best = rd_cost1 < rd_cost0;

    /* Walk the winning path forward, writing back the chosen quantized and
     * dequantized values and tracking the new end of block. */
    final_eob = i0 - 1;

    for (i = next; i < eob; i = next)
    {
        x = tokens[i][best].qc;

        if (x)
            final_eob = i;

        rc = vp8_default_zig_zag1d[i];
        qcoeff_ptr[rc] = x;
        dqcoeff_ptr[rc] = x * dequant_ptr[rc];
        next = tokens[i][best].next;
        best = (best_mask[best] >> i) & 1;
    }

    final_eob++;

    d->eob = final_eob;
    *a = *l = (d->eob != !type);
}

void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    int type;
    int has_2nd_order;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
    type = has_2nd_order ? 0 : 3;

    for (b = 0; b < 16; b++)
    {
        vp8_optimize_b(x, b, type,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    for (b = 16; b < 20; b++)
    {
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    for (b = 20; b < 24; b++)
    {
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    /* Optimize the second order block when the mode carries one. */
    if (has_2nd_order)
    {
        b = 24;
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}

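/* Block index convention used by the loops above: blocks 0-15 are the 4x4
 * luma blocks, 16-19 the U blocks, 20-23 the V blocks, and block 24 is the
 * second order (Y2) DC block, which only exists when the macroblock mode is
 * neither B_PRED nor SPLITMV.
 */
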
void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    int type;
    int has_2nd_order;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    if (!x->e_mbd.above_context)
        return;

    if (!x->e_mbd.left_context)
        return;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
    type = has_2nd_order ? 0 : 3;

    for (b = 0; b < 16; b++)
    {
        vp8_optimize_b(x, b, type,
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    /* Optimize the second order block when the mode carries one. */
    if (has_2nd_order)
    {
        b = 24;
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}

void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
{
    int b;
    ENTROPY_CONTEXT_PLANES t_above, t_left;
    ENTROPY_CONTEXT *ta;
    ENTROPY_CONTEXT *tl;

    if (!x->e_mbd.above_context)
        return;

    if (!x->e_mbd.left_context)
        return;

    vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
    vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

    ta = (ENTROPY_CONTEXT *)&t_above;
    tl = (ENTROPY_CONTEXT *)&t_left;

    for (b = 16; b < 20; b++)
    {
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }

    for (b = 20; b < 24; b++)
    {
        vp8_optimize_b(x, b, vp8_block2type[b],
            ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd);
    }
}
#endif

void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mb(&x->e_mbd);

    vp8_subtract_mb(rtcd, x);

    vp8_transform_mb(x);

    vp8_quantize_mb(x);

#if !(CONFIG_REALTIME_ONLY)
    if (x->optimize && x->rddiv > 1)
        vp8_optimize_mb(x, rtcd);
#endif

    vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

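/* Inter macroblock encode order, as implemented above: build the inter
 * prediction, subtract it from the source to get the residual, forward
 * transform, quantize, optionally run the trellis coefficient optimization
 * (non-realtime builds only), then inverse transform and reconstruct so the
 * encoder's reference matches what the decoder will produce.
 */
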
/* this function is used by first pass only */
void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mby(&x->e_mbd);

    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);

    vp8_transform_mby(x);

    vp8_quantize_mby(x);

    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mbuv(&x->e_mbd);

    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);

    vp8_transform_mbuv(x);

    vp8_quantize_mbuv(x);

    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}

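/* The variant below stops after quantization (no inverse transform or
 * reconstruction); presumably it is only used while evaluating
 * rate-distortion choices, where the quantized chroma coefficients are all
 * that is needed.
 */
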
void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
    vp8_build_inter_predictors_mbuv(&x->e_mbd);
    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);

    vp8_transform_mbuv(x);

    vp8_quantize_mbuv(x);
}