From 657f3e9f14750c90c187da4d5fc8ce6f116f1403 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 28 Apr 2017 06:34:21 -0700 Subject: [PATCH] Use uint32_t for accumulator Be specific about the data type size. Use convenience macro vp9_zero_array. Change-Id: I5fadf7dbd408befb73820d85db0be4832e8cfcbd --- vp9/common/vp9_rtcd_defs.pl | 4 ++-- vp9/encoder/vp9_temporal_filter.c | 11 ++++++----- vp9/encoder/x86/temporal_filter_sse4.c | 9 +++------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 47983aeef..da449e254 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -197,7 +197,7 @@ $vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; +add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse4_1/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -217,7 +217,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; + add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; } # End vp9_high encoder functions diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 6ca5be090..cc6b36c9d 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -13,6 +13,7 @@ #include #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_reconinter.h" @@ -94,7 +95,7 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, + int filter_weight, uint32_t *accumulator, uint16_t *count) { unsigned int i, j, k; int modifier; @@ -162,7 +163,7 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, void vp9_highbd_temporal_filter_apply_c( const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { + int filter_weight, uint32_t *accumulator, uint16_t *count) { const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; @@ -292,7 +293,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, unsigned int filter_weight; int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; @@ -339,8 +340,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, int stride; MV ref_mv; - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + vp9_zero_array(accumulator, 16 * 16 * 3); + vp9_zero_array(count, 16 * 16 * 3); td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.col_max = diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 0419aa170..be4cd8685 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -165,7 +165,7 @@ static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, // Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, - uint16_t *count, unsigned int *accumulator) { + uint16_t *count, uint32_t *accumulator) { const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); const __m128i zero = _mm_setzero_si128(); __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); @@ -194,7 +194,7 @@ static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, static void accumulate_and_store_16(const __m128i sum_0_u16, const __m128i sum_1_u16, const uint8_t *pred, uint16_t *count, - unsigned int *accumulator) { + uint32_t *accumulator) { const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); const __m128i zero = _mm_setzero_si128(); __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), @@ -237,7 +237,7 @@ static void accumulate_and_store_16(const __m128i sum_0_u16, void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, const uint8_t *b, unsigned int width, unsigned int height, int strength, - int weight, unsigned int *accumulator, + int weight, uint32_t *accumulator, uint16_t *count) { unsigned int h; const int rounding = strength > 0 ? 1 << (strength - 1) : 0; @@ -250,9 +250,6 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, assert(width == 8 || width == 16); - // TODO(johannkoenig) Use uint32_t for accumulator. - assert(sizeof(*accumulator) == sizeof(uint32_t)); - if (width == 8) { __m128i sum_row_a, sum_row_b, sum_row_c; __m128i mul_constants = _mm_setr_epi16( -- 2.40.0