From 15149484ec75e77a6fa4b0ce2e4ede5cb6a63c39 Mon Sep 17 00:00:00 2001 From: Ryan Lei Date: Tue, 25 Oct 2016 18:48:43 -0700 Subject: [PATCH] Add parallel-deblocking experiment This commit is a manual cherry-pick from aom/master: 42ff3881ace1564aac9debae86ef37a8deb8d381 Change-Id: I4a3cdb939b7b96a3aa27f6a00da7a0e73222f3f3 --- av1/common/loopfilter.c | 156 +++++++++++++++---- av1/common/loopfilter.h | 31 ++-- av1/common/thread_common.c | 299 ++++++++++++++++++++++++++++++------- configure | 1 + 4 files changed, 392 insertions(+), 95 deletions(-) diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c index d0b897c74..dc7ee188d 100644 --- a/av1/common/loopfilter.c +++ b/av1/common/loopfilter.c @@ -1183,9 +1183,10 @@ static void highbd_filter_selectively_vert( } #endif // CONFIG_AOM_HIGHBITDEPTH -void av1_filter_block_plane_non420(AV1_COMMON *cm, - struct macroblockd_plane *plane, - MODE_INFO **mib, int mi_row, int mi_col) { +void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO **mib, int mi_row, + int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; @@ -1369,6 +1370,22 @@ void av1_filter_block_plane_non420(AV1_COMMON *cm, // Now do horizontal pass dst->buf = dst0; +} + +void av1_filter_block_plane_non420_hor(AV1_COMMON *cm, + struct macroblockd_plane *plane, + int mi_row) { + const int ss_y = plane->subsampling_y; + const int row_step = 1 << ss_y; + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 }; + unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 }; + unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 }; + unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 }; + uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE]; + int r; + for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) { const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; @@ -1404,11 +1421,12 @@ void av1_filter_block_plane_non420(AV1_COMMON *cm, #endif // CONFIG_AOM_HIGHBITDEPTH dst->buf += MI_SIZE * dst->stride; } + dst->buf = dst0; } -void av1_filter_block_plane_ss00(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { +void av1_filter_block_plane_ss00_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; int r; @@ -1452,10 +1470,20 @@ void av1_filter_block_plane_ss00(AV1_COMMON *const cm, // Horizontal pass dst->buf = dst0; - mask_16x16 = lfm->above_y[TX_16X16]; - mask_8x8 = lfm->above_y[TX_8X8]; - mask_4x4 = lfm->above_y[TX_4X4]; - mask_4x4_int = lfm->int_4x4_y; +} + +void av1_filter_block_plane_ss00_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r; + uint64_t mask_16x16 = lfm->above_y[TX_16X16]; + uint64_t mask_8x8 = lfm->above_y[TX_8X8]; + uint64_t mask_4x4 = lfm->above_y[TX_4X4]; + uint64_t mask_4x4_int = lfm->int_4x4_y; + + assert(plane->subsampling_x == 0 && plane->subsampling_y == 0); for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) { unsigned int mask_16x16_r; @@ -1495,11 +1523,13 @@ void av1_filter_block_plane_ss00(AV1_COMMON *const cm, mask_4x4 >>= MI_SIZE; mask_4x4_int >>= MI_SIZE; } + // restore the buf pointer in case there is additional filter pass. + dst->buf = dst0; } -void av1_filter_block_plane_ss11(AV1_COMMON *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm) { +void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; int r, c; @@ -1554,10 +1584,20 @@ void av1_filter_block_plane_ss11(AV1_COMMON *const cm, // Horizontal pass dst->buf = dst0; - mask_16x16 = lfm->above_uv[TX_16X16]; - mask_8x8 = lfm->above_uv[TX_8X8]; - mask_4x4 = lfm->above_uv[TX_4X4]; - mask_4x4_int = lfm->above_int_4x4_uv; +} + +void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm) { + struct buf_2d *const dst = &plane->dst; + uint8_t *const dst0 = dst->buf; + int r; + uint64_t mask_16x16 = lfm->above_uv[TX_16X16]; + uint64_t mask_8x8 = lfm->above_uv[TX_8X8]; + uint64_t mask_4x4 = lfm->above_uv[TX_4X4]; + uint64_t mask_4x4_int = lfm->above_int_4x4_uv; + + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) { const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; @@ -1600,6 +1640,8 @@ void av1_filter_block_plane_ss11(AV1_COMMON *const cm, mask_4x4 >>= MI_SIZE / 2; mask_4x4_int >>= MI_SIZE / 2; } + // restore the buf pointer in case there is additional filter pass. + dst->buf = dst0; } void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, @@ -1622,12 +1664,14 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); - for (plane = 0; plane < num_planes; ++plane) - av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row, - mi_col); + for (plane = 0; plane < num_planes; ++plane) { + av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + } } } -#else +#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; enum lf_path path; @@ -1641,7 +1685,34 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, path = LF_PATH_444; else path = LF_PATH_SLOW; +#if CONFIG_PARALLEL_DEBLOCKING + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + int plane; + + av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + // TODO(JBB): Make setup_mask work for non 420. + av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); + + av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm); + break; + case LF_PATH_444: + av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm); + break; + case LF_PATH_SLOW: + av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } + } + } for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { @@ -1652,23 +1723,56 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, // TODO(JBB): Make setup_mask work for non 420. av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); - av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm); + av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm); for (plane = 1; plane < num_planes; ++plane) { switch (path) { case LF_PATH_420: - av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm); + av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm); break; case LF_PATH_444: - av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm); + av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm); break; case LF_PATH_SLOW: - av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + break; + } + } + } + } +#else // CONFIG_PARALLEL_DEBLOCKING + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + int plane; + + av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); + + // TODO(JBB): Make setup_mask work for non 420. + av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); + + av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm); + av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm); + for (plane = 1; plane < num_planes; ++plane) { + switch (path) { + case LF_PATH_420: + av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm); + av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm); + break; + case LF_PATH_444: + av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm); + av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm); + break; + case LF_PATH_SLOW: + av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + break; } } } } +#endif // CONFIG_PARALLEL_DEBLOCKING #endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES } diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h index 975cbdf19..cdc251208 100644 --- a/av1/common/loopfilter.h +++ b/av1/common/loopfilter.h @@ -99,17 +99,26 @@ void av1_setup_mask(struct AV1Common *const cm, const int mi_row, const int mi_col, MODE_INFO **mi_8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); -void av1_filter_block_plane_ss00(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); - -void av1_filter_block_plane_ss11(struct AV1Common *const cm, - struct macroblockd_plane *const plane, - int mi_row, LOOP_FILTER_MASK *lfm); - -void av1_filter_block_plane_non420(struct AV1Common *cm, - struct macroblockd_plane *plane, - MODE_INFO **mi_8x8, int mi_row, int mi_col); +void av1_filter_block_plane_ss00_ver(struct AV1Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); +void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); +void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); +void av1_filter_block_plane_ss11_hor(struct AV1Common *const cm, + struct macroblockd_plane *const plane, + int mi_row, LOOP_FILTER_MASK *lfm); + +void av1_filter_block_plane_non420_ver(struct AV1Common *cm, + struct macroblockd_plane *plane, + MODE_INFO **mi_8x8, int mi_row, + int mi_col); +void av1_filter_block_plane_non420_hor(struct AV1Common *cm, + struct macroblockd_plane *plane, + int mi_row); void av1_loop_filter_init(struct AV1Common *cm); diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index eeaeb21fe..11006715b 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c @@ -85,25 +85,153 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, #endif // CONFIG_MULTITHREAD } -// Implement row loopfiltering for each thread. -static INLINE void thread_loop_filter_rows( - const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm, - struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, - int y_only, AV1LfSync *const lf_sync) { - const int num_planes = y_only ? 1 : MAX_MB_PLANE; - const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2; - int mi_row, mi_col; #if !CONFIG_EXT_PARTITION_TYPES - enum lf_path path; - LOOP_FILTER_MASK lfm; +static INLINE enum lf_path get_loop_filter_path( + int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) { if (y_only) - path = LF_PATH_444; + return LF_PATH_444; else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) - path = LF_PATH_420; + return LF_PATH_420; else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) - path = LF_PATH_444; + return LF_PATH_444; else - path = LF_PATH_SLOW; + return LF_PATH_SLOW; +} + +static INLINE void loop_filter_block_plane_ver( + AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, + MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, + LOOP_FILTER_MASK *lfm) { + if (plane == 0) { + av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, lfm); + } else { + switch (path) { + case LF_PATH_420: + av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); + break; + } + } +} + +static INLINE void loop_filter_block_plane_hor( + AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, + int mi_row, enum lf_path path, LOOP_FILTER_MASK *lfm) { + if (plane == 0) { + av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm); + } else { + switch (path) { + case LF_PATH_420: + av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_444: + av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm); + break; + case LF_PATH_SLOW: + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + break; + } + } +} +#endif +// Row-based multi-threaded loopfilter hook +#if CONFIG_PARALLEL_DEBLOCKING +static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync, + LFWorkerData *const lf_data) { + const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; + int mi_row, mi_col; +#if !CONFIG_EXT_PARTITION_TYPES + enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); +#endif + for (mi_row = lf_data->start; mi_row < lf_data->stop; + mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { + MODE_INFO **const mi = + lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; + + for (mi_col = 0; mi_col < lf_data->cm->mi_cols; + mi_col += lf_data->cm->mib_size) { + LOOP_FILTER_MASK lfm; + int plane; + + av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row, + mi_col); + av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, + lf_data->cm->mi_stride, &lfm); + +#if CONFIG_EXT_PARTITION_TYPES + for (plane = 0; plane < num_planes; ++plane) + av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], + mi + mi_col, mi_row, mi_col); +#else + + for (plane = 0; plane < num_planes; ++plane) + loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi, + mi_row, mi_col, path, &lfm); +#endif + } + } + return 1; +} + +static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync, + LFWorkerData *const lf_data) { + const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; + const int sb_cols = + mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; + int mi_row, mi_col; +#if !CONFIG_EXT_PARTITION_TYPES + enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); +#endif + + for (mi_row = lf_data->start; mi_row < lf_data->stop; + mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { + MODE_INFO **const mi = + lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; + + for (mi_col = 0; mi_col < lf_data->cm->mi_cols; + mi_col += lf_data->cm->mib_size) { + const int r = mi_row >> lf_data->cm->mib_size_log2; + const int c = mi_col >> lf_data->cm->mib_size_log2; + LOOP_FILTER_MASK lfm; + int plane; + + // TODO(wenhao.zhang@intel.com): For better parallelization, reorder + // the outer loop to column-based and remove the synchronizations here. + sync_read(lf_sync, r, c); + + av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row, + mi_col); + av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, + lf_data->cm->mi_stride, &lfm); +#if CONFIG_EXT_PARTITION_TYPES + for (plane = 0; plane < num_planes; ++plane) + av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], + mi_row); +#else + for (plane = 0; plane < num_planes; ++plane) + loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row, + path, &lfm); +#endif + sync_write(lf_sync, r, c, sb_cols); + } + } + return 1; +} +#else // CONFIG_PARALLEL_DEBLOCKING +static int loop_filter_row_worker(AV1LfSync *const lf_sync, + LFWorkerData *const lf_data) { + const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE; + const int sb_cols = + mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2; + int mi_row, mi_col; +#if !CONFIG_EXT_PARTITION_TYPES + enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes); #endif // !CONFIG_EXT_PARTITION_TYPES #if CONFIG_EXT_PARTITION @@ -113,56 +241,48 @@ static INLINE void thread_loop_filter_rows( exit(EXIT_FAILURE); #endif // CONFIG_EXT_PARTITION - for (mi_row = start; mi_row < stop; - mi_row += lf_sync->num_workers * cm->mib_size) { - MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_row = lf_data->start; mi_row < lf_data->stop; + mi_row += lf_sync->num_workers * lf_data->cm->mib_size) { + MODE_INFO **const mi = + lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride; - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) { - const int r = mi_row >> cm->mib_size_log2; - const int c = mi_col >> cm->mib_size_log2; + for (mi_col = 0; mi_col < lf_data->cm->mi_cols; + mi_col += lf_data->cm->mib_size) { + const int r = mi_row >> lf_data->cm->mib_size_log2; + const int c = mi_col >> lf_data->cm->mib_size_log2; +#if !CONFIG_EXT_PARTITION_TYPES + LOOP_FILTER_MASK lfm; +#endif int plane; sync_read(lf_sync, r, c); - av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col); - + av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row, + mi_col); #if CONFIG_EXT_PARTITION_TYPES - for (plane = 0; plane < num_planes; ++plane) - av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, mi_row, - mi_col); + for (plane = 0; plane < num_planes; ++plane) { + av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], + mi + mi_col, mi_row, mi_col); + av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], + mi_row); + } #else - // TODO(JBB): Make setup_mask work for non 420. - av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm); - - av1_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm); - for (plane = 1; plane < num_planes; ++plane) { - switch (path) { - case LF_PATH_420: - av1_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm); - break; - case LF_PATH_444: - av1_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm); - break; - case LF_PATH_SLOW: - av1_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); - break; - } + av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col, + lf_data->cm->mi_stride, &lfm); + + for (plane = 0; plane < num_planes; ++plane) { + loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi, + mi_row, mi_col, path, &lfm); + loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row, + path, &lfm); } #endif // CONFIG_EXT_PARTITION_TYPES sync_write(lf_sync, r, c, sb_cols); } } -} - -// Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(AV1LfSync *const lf_sync, - LFWorkerData *const lf_data) { - thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only, - lf_sync); return 1; } +#endif // CONFIG_PARALLEL_DEBLOCKING static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], @@ -191,17 +311,79 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } +// Set up loopfilter thread data. +// The decoder is capping num_workers because it has been observed that using +// more threads on the loopfilter than there are cores will hurt performance +// on Android. This is because the system will only schedule the tile decode +// workers on cores equal to the number of tile columns. Then if the decoder +// tries to use more threads for the loopfilter, it will hurt performance +// because of contention. If the multithreading code changes in the future +// then the number of workers used by the loopfilter should be revisited. + +#if CONFIG_PARALLEL_DEBLOCKING + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + // Filter all the vertical edges in the whole frame + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + av1_loop_filter_data_reset(lf_data, frame, cm, planes); + lf_data->start = start + i * cm->mib_size; + lf_data->stop = stop; + lf_data->y_only = y_only; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } + + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + // Filter all the horizontal edges in the whole frame + for (i = 0; i < num_workers; ++i) { + AVxWorker *const worker = &workers[i]; + LFWorkerData *const lf_data = &lf_sync->lfdata[i]; + + worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker; + worker->data1 = lf_sync; + worker->data2 = lf_data; + + // Loopfilter data + av1_loop_filter_data_reset(lf_data, frame, cm, planes); + lf_data->start = start + i * cm->mib_size; + lf_data->stop = stop; + lf_data->y_only = y_only; + + // Start loopfiltering + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + // Wait till all rows are finished + for (i = 0; i < num_workers; ++i) { + winterface->sync(&workers[i]); + } +#else // CONFIG_PARALLEL_DEBLOCKING // Initialize cur_sb_col to -1 for all SB rows. memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); - // Set up loopfilter thread data. - // The decoder is capping num_workers because it has been observed that using - // more threads on the loopfilter than there are cores will hurt performance - // on Android. This is because the system will only schedule the tile decode - // workers on cores equal to the number of tile columns. Then if the decoder - // tries to use more threads for the loopfilter, it will hurt performance - // because of contention. If the multithreading code changes in the future - // then the number of workers used by the loopfilter should be revisited. for (i = 0; i < num_workers; ++i) { AVxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; @@ -228,6 +410,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, for (i = 0; i < num_workers; ++i) { winterface->sync(&workers[i]); } +#endif // CONFIG_PARALLEL_DEBLOCKING } void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, diff --git a/configure b/configure index 5bfce0416..fa458f7ad 100755 --- a/configure +++ b/configure @@ -289,6 +289,7 @@ EXPERIMENT_LIST=" delta_q adapt_scan filter_7bit + parallel_deblocking " CONFIG_LIST=" dependency_tracking -- 2.49.0