From a185bc3350ca6519ad018f84189ea289e362e7d0 Mon Sep 17 00:00:00 2001 From: Deb Mukherjee Date: Tue, 20 May 2014 10:48:54 -0700 Subject: [PATCH] Extends temporal filtering to work for 422 data This is needed for profiles 1 and 2. Change-Id: I5dd7644c2932d055ab89e050d4be7d4117cd1028 --- vp9/common/vp9_rtcd_defs.pl | 2 +- vp9/encoder/vp9_temporal_filter.c | 47 +++++++++--------- .../x86/vp9_temporal_filter_apply_sse2.asm | 49 ++++++++++--------- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1037bfbc3..182739620 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -772,7 +772,7 @@ $vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4; add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_full_range_search/; -add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; +add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse2/; } diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 6eff20080..f501971e2 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -34,7 +34,8 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, - int uv_block_size, + int uv_block_width, + int uv_block_height, int mv_row, int mv_col, uint8_t *pred, @@ -47,7 +48,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_size == 8) { + if (uv_block_width == 8) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { @@ -64,18 +65,18 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, kernel, MV_PRECISION_Q3, x, y); vp9_build_inter_predictor(u_mb_ptr, uv_stride, - &pred[256], uv_block_size, + &pred[256], uv_block_width, &mv, scale, - uv_block_size, uv_block_size, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y); vp9_build_inter_predictor(v_mb_ptr, uv_stride, - &pred[512], uv_block_size, + &pred[512], uv_block_width, &mv, scale, - uv_block_size, uv_block_size, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, y); } @@ -91,7 +92,8 @@ void vp9_temporal_filter_init() { void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, - unsigned int block_size, + unsigned int block_width, + unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, @@ -101,8 +103,8 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, int byte = 0; const int rounding = strength > 0 ? 1 << (strength - 1) : 0; - for (i = 0, k = 0; i < block_size; i++) { - for (j = 0; j < block_size; j++, k++) { + for (i = 0, k = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++, k++) { int src_byte = frame1[byte]; int pixel_value = *frame2++; @@ -127,7 +129,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1, byte++; } - byte += stride - block_size; + byte += stride - block_width; } } @@ -204,14 +206,12 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, uint8_t *dst1, *dst2; DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3); const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; + const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; // Save input state uint8_t* input_buffer[MAX_MB_PLANE]; int i; - // TODO(aconverse): Add 4:2:2 support - assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y); - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; @@ -275,7 +275,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, cpi->frames[frame]->u_buffer + mb_uv_offset, cpi->frames[frame]->v_buffer + mb_uv_offset, cpi->frames[frame]->y_stride, - mb_uv_height, + mb_uv_width, mb_uv_height, mbd->mi[0]->bmi[0].as_mv[0].as_mv.row, mbd->mi[0]->bmi[0].as_mv[0].as_mv.col, predictor, scale, @@ -283,16 +283,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, // Apply the filter (YUV) vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, strength, filter_weight, + predictor, 16, 16, + strength, filter_weight, accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_height, strength, + predictor + 256, + mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 256, count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_height, strength, + predictor + 512, + mb_uv_width, mb_uv_height, strength, filter_weight, accumulator + 512, count + 512); } @@ -321,7 +322,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; for (i = 0, k = 256; i < mb_uv_height; i++) { - for (j = 0; j < mb_uv_height; j++, k++) { + for (j = 0; j < mb_uv_width; j++, k++) { int m = k + 256; // U @@ -339,13 +340,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, // move to next pixel byte++; } - byte += stride - mb_uv_height; + byte += stride - mb_uv_width; } mb_y_offset += 16; - mb_uv_offset += mb_uv_height; + mb_uv_offset += mb_uv_width; } mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols); + mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; } // Restore input state diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm index d2d13b383..673e0b3a6 100644 --- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm +++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm @@ -15,41 +15,45 @@ ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 ; unsigned char *frame2, | 2 -; unsigned int block_size, | 3 -; int strength, | 4 -; int filter_weight, | 5 -; unsigned int *accumulator, | 6 -; unsigned short *count) | 7 +; unsigned int block_width, | 3 +; unsigned int block_height, | 4 +; int strength, | 5 +; int filter_weight, | 6 +; unsigned int *accumulator, | 7 +; unsigned short *count) | 8 global sym(vp9_temporal_filter_apply_sse2) PRIVATE sym(vp9_temporal_filter_apply_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 + SHADOW_ARGS_TO_STACK 9 SAVE_XMM 7 GET_GOT rbx push rsi push rdi ALIGN_STACK 16, rax - %define block_size 0 - %define strength 16 - %define filter_weight 32 - %define rounding_bit 48 - %define rbp_backup 64 - %define stack_size 80 + %define block_width 0 + %define block_height 16 + %define strength 32 + %define filter_weight 48 + %define rounding_bit 64 + %define rbp_backup 80 + %define stack_size 96 sub rsp, stack_size mov [rsp + rbp_backup], rbp ; end prolog mov rdx, arg(3) - mov [rsp + block_size], rdx - movd xmm6, arg(4) + mov [rsp + block_width], rdx + mov rdx, arg(4) + mov [rsp + block_height], rdx + movd xmm6, arg(5) movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read ; calculate the rounding bit outside the loop ; 0x8000 >> (16 - strength) mov rdx, 16 - sub rdx, arg(4) ; 16 - strength + sub rdx, arg(5) ; 16 - strength movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 @@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rsi, arg(0) ; src/frame1 mov rdx, arg(2) ; predictor frame - mov rdi, arg(6) ; accumulator - mov rax, arg(7) ; count + mov rdi, arg(7) ; accumulator + mov rax, arg(8) ; count ; dup the filter weight and store for later - movd xmm0, arg(5) ; filter_weight + movd xmm0, arg(6) ; filter_weight pshuflw xmm0, xmm0, 0 punpcklwd xmm0, xmm0 movdqa [rsp + filter_weight], xmm0 @@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2): mov rbp, arg(1) ; stride pxor xmm7, xmm7 ; zero for extraction - lea rcx, [rdx + 16*16*1] - cmp dword ptr [rsp + block_size], 8 + mov rcx, [rsp + block_width] + imul rcx, [rsp + block_height] + add rcx, rdx + cmp dword ptr [rsp + block_width], 8 jne .temporal_filter_apply_load_16 - lea rcx, [rdx + 8*8*1] .temporal_filter_apply_load_8: movq xmm0, [rsi] ; first row @@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2): cmp rdx, rcx je .temporal_filter_apply_epilog pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_size], 16 + cmp dword ptr [rsp + block_width], 16 je .temporal_filter_apply_load_16 jmp .temporal_filter_apply_load_8 -- 2.40.0