From 6fd7dd1a703b922a5f200c4e1962be5b81c73af0 Mon Sep 17 00:00:00 2001 From: John Koleszar Date: Wed, 20 Feb 2013 15:59:20 -0800 Subject: [PATCH] Use 256-byte aligned filter tables This avoids duplicating all the filters twice. Includes fixups to the convolve routines and associated tests to make this work. Change-Id: I922f86021594e55072ddb63b42b2313605db6e00 --- test/convolve_test.cc | 52 ++++++++++---------- vp9/common/vp9_convolve.c | 26 ++++++---- vp9/common/vp9_filter.c | 97 +++---------------------------------- vp9/common/vp9_filter.h | 10 ++-- vp9/common/vp9_reconinter.c | 1 + 5 files changed, 57 insertions(+), 129 deletions(-) diff --git a/test/convolve_test.cc b/test/convolve_test.cc index efb87acb7..83e0e3cce 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -13,6 +13,7 @@ extern "C" { #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/mem.h" } #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/acm_random.h" @@ -430,19 +431,7 @@ TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) { } } -TEST_P(ConvolveTest, ChangeFilterWorks) { - uint8_t* const in = input(); - uint8_t* const out = output(); - - const int16_t filters[][8] = { - { 0, 0, 0, 0, 0, 0, 0, 128}, - { 0, 0, 0, 0, 0, 0, 128}, - { 0, 0, 0, 0, 0, 128}, - { 0, 0, 0, 0, 128}, - { 0, 0, 0, 128}, - { 0, 0, 128}, - { 0, 128}, - { 128}, +DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = { { 0, 0, 0, 0, 0, 0, 0, 128}, { 0, 0, 0, 0, 0, 0, 128}, { 0, 0, 0, 0, 0, 128}, @@ -458,32 +447,45 @@ TEST_P(ConvolveTest, ChangeFilterWorks) { { 0, 0, 0, 128}, { 0, 0, 128}, { 0, 128}, - { 128}, - }; + { 128} +}; + +TEST_P(ConvolveTest, ChangeFilterWorks) { + uint8_t* const in = input(); + uint8_t* const out = output(); REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride, - filters[0], 17, filters[4], 16, + kChangeFilters[8], 17, kChangeFilters[4], 16, Width(), Height())); - for (int x = 0; x < (Width() > 4 ? 8 : 4); ++x) { - ASSERT_EQ(in[4], out[x]) << "x == " << x; + for (int x = 0; x < Width(); ++x) { + if (x < 8) + ASSERT_EQ(in[4], out[x]) << "x == " << x; + else + ASSERT_EQ(in[12], out[x]) << "x == " << x; } REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride, - filters[4], 16, filters[0], 17, + kChangeFilters[4], 16, kChangeFilters[8], 17, Width(), Height())); - for (int y = 0; y < (Height() > 4 ? 8 : 4); ++y) { - ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y; + for (int y = 0; y < Height(); ++y) { + if (y < 8) + ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y; + else + ASSERT_EQ(in[12 * kInputStride], out[y * kOutputStride]) << "y == " << y; } REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride, - filters[0], 17, filters[0], 17, + kChangeFilters[8], 17, kChangeFilters[8], 17, Width(), Height())); - for (int y = 0; y < (Height() > 4 ? 8 : 4); ++y) { - for (int x = 0; x < (Width() > 4 ? 8 : 4); ++x) { - ASSERT_EQ(in[4 * kInputStride + 4], out[y * kOutputStride + x]) + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const int ref_x = x < 8 ? 4 : 12; + const int ref_y = y < 8 ? 4 : 12; + + ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x]) << "x == " << x << ", y == " << y; } } diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index f1b5915bd..b062e7dc7 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -19,7 +19,6 @@ #define VP9_FILTER_WEIGHT 128 #define VP9_FILTER_SHIFT 7 -#define ALIGN_FILTERS_256 0 /* Assume a bank of 16 filters to choose from. There are two implementations * for filter wrapping behavior, since we want to be able to pick which filter @@ -34,8 +33,11 @@ * always 256 byte aligned. * * Implementations 2 and 3 are likely preferable, as they avoid an extra 2 - * parameters, and switching between them is trivial. + * parameters, and switching between them is trivial, with the + * ALIGN_FILTERS_256 macro, below. */ + #define ALIGN_FILTERS_256 1 + static void convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x0, int x_step_q4, @@ -56,11 +58,12 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride, const int16_t *filter_x = filter_x0; /* Initial phase offset */ - int x_q4 = (filter_x - filter_x_base) / taps; + int x0_q4 = (filter_x - filter_x_base) / taps; + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = x_q4 >> 4; + int src_x = (x_q4 - x0_q4) >> 4; for (sum = 0, k = 0; k < taps; ++k) { sum += src[src_x + k] * filter_x[k]; @@ -97,11 +100,12 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride, const int16_t *filter_x = filter_x0; /* Initial phase offset */ - int x_q4 = (filter_x - filter_x_base) / taps; + int x0_q4 = (filter_x - filter_x_base) / taps; + int x_q4 = x0_q4; for (x = 0; x < w; ++x) { /* Per-pixel src offset */ - int src_x = x_q4 >> 4; + int src_x = (x_q4 - x0_q4) >> 4; for (sum = 0, k = 0; k < taps; ++k) { sum += src[src_x + k] * filter_x[k]; @@ -138,11 +142,12 @@ static void convolve_vert_c(const uint8_t *src, int src_stride, const int16_t *filter_y = filter_y0; /* Initial phase offset */ - int y_q4 = (filter_y - filter_y_base) / taps; + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = y_q4 >> 4; + int src_y = (y_q4 - y0_q4) >> 4; for (sum = 0, k = 0; k < taps; ++k) { sum += src[(src_y + k) * src_stride] * filter_y[k]; @@ -179,11 +184,12 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride, const int16_t *filter_y = filter_y0; /* Initial phase offset */ - int y_q4 = (filter_y - filter_y_base) / taps; + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; for (y = 0; y < h; ++y) { /* Per-pixel src offset */ - int src_y = y_q4 >> 4; + int src_y = (y_q4 - y0_q4) >> 4; for (sum = 0, k = 0; k < taps; ++k) { sum += src[(src_y + k) * src_stride] * filter_y[k]; diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 9249c5786..434c63e7e 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -15,26 +15,7 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_common.h" -/* TODO(jkoleszar): We can avoid duplicating these tables 2X by forcing 256 - * byte alignment of the table's base address. - */ -DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS*2][8]) = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, - { 0, 0, 0, 120, 8, 0, 0, 0 }, - { 0, 0, 0, 112, 16, 0, 0, 0 }, - { 0, 0, 0, 104, 24, 0, 0, 0 }, - { 0, 0, 0, 96, 32, 0, 0, 0 }, - { 0, 0, 0, 88, 40, 0, 0, 0 }, - { 0, 0, 0, 80, 48, 0, 0, 0 }, - { 0, 0, 0, 72, 56, 0, 0, 0 }, - { 0, 0, 0, 64, 64, 0, 0, 0 }, - { 0, 0, 0, 56, 72, 0, 0, 0 }, - { 0, 0, 0, 48, 80, 0, 0, 0 }, - { 0, 0, 0, 40, 88, 0, 0, 0 }, - { 0, 0, 0, 32, 96, 0, 0, 0 }, - { 0, 0, 0, 24, 104, 0, 0, 0 }, - { 0, 0, 0, 16, 112, 0, 0, 0 }, - { 0, 0, 0, 8, 120, 0, 0, 0 }, +DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 }, { 0, 0, 0, 112, 16, 0, 0, 0 }, @@ -55,7 +36,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS*2][8]) = { #define FILTER_ALPHA 0 #define FILTER_ALPHA_SHARP 1 -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS*2][8]) +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { #if FILTER_ALPHA == 0 /* Lagrangian interpolation filter */ @@ -74,23 +55,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS*2][8]) { -1, 4, -11, 37, 112, -16, 4, -1}, { -1, 3, -9, 27, 118, -13, 4, -1}, { 0, 2, -6, 18, 122, -10, 3, -1}, - { 0, 1, -3, 8, 126, -5, 1, 0}, - { 0, 0, 0, 128, 0, 0, 0, 0}, - { 0, 1, -5, 126, 8, -3, 1, 0}, - { -1, 3, -10, 122, 18, -6, 2, 0}, - { -1, 4, -13, 118, 27, -9, 3, -1}, - { -1, 4, -16, 112, 37, -11, 4, -1}, - { -1, 5, -18, 105, 48, -14, 4, -1}, - { -1, 5, -19, 97, 58, -16, 5, -1}, - { -1, 6, -19, 88, 68, -18, 5, -1}, - { -1, 6, -19, 78, 78, -19, 6, -1}, - { -1, 5, -18, 68, 88, -19, 6, -1}, - { -1, 5, -16, 58, 97, -19, 5, -1}, - { -1, 4, -14, 48, 105, -18, 5, -1}, - { -1, 4, -11, 37, 112, -16, 4, -1}, - { -1, 3, -9, 27, 118, -13, 4, -1}, - { 0, 2, -6, 18, 122, -10, 3, -1}, { 0, 1, -3, 8, 126, -5, 1, 0} + #elif FILTER_ALPHA == 50 /* Generated using MATLAB: * alpha = 0.5; @@ -118,7 +84,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS*2][8]) #endif /* FILTER_ALPHA */ }; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS*2][8]) +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = { #if FILTER_ALPHA_SHARP == 1 /* dct based filter */ @@ -137,23 +103,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS*2][8]) {-2, 6, -13, 37, 115, -20, 9, -4}, {-2, 5, -10, 27, 121, -17, 7, -3}, {-1, 3, -6, 17, 125, -13, 5, -2}, - {0, 1, -3, 8, 127, -7, 3, -1}, - {0, 0, 0, 128, 0, 0, 0, 0}, - {-1, 3, -7, 127, 8, -3, 1, 0}, - {-2, 5, -13, 125, 17, -6, 3, -1}, - {-3, 7, -17, 121, 27, -10, 5, -2}, - {-4, 9, -20, 115, 37, -13, 6, -2}, - {-4, 10, -23, 108, 48, -16, 8, -3}, - {-4, 10, -24, 100, 59, -19, 9, -3}, - {-4, 11, -24, 90, 70, -21, 10, -4}, - {-4, 11, -23, 80, 80, -23, 11, -4}, - {-4, 10, -21, 70, 90, -24, 11, -4}, - {-3, 9, -19, 59, 100, -24, 10, -4}, - {-3, 8, -16, 48, 108, -23, 10, -4}, - {-2, 6, -13, 37, 115, -20, 9, -4}, - {-2, 5, -10, 27, 121, -17, 7, -3}, - {-1, 3, -6, 17, 125, -13, 5, -2}, {0, 1, -3, 8, 127, -7, 3, -1} + #elif FILTER_ALPHA_SHARP == 75 /* alpha = 0.75 */ {0, 0, 0, 128, 0, 0, 0, 0}, @@ -175,8 +126,8 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS*2][8]) #endif /* FILTER_ALPHA_SHARP */ }; -DECLARE_ALIGNED(16, const int16_t, - vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS*2][8]) = { +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = { /* 8-tap lowpass filter */ /* Hamming window */ {-1, -7, 32, 80, 32, -7, -1, 0}, @@ -194,26 +145,10 @@ DECLARE_ALIGNED(16, const int16_t, { 1, -3, -4, 50, 76, 16, -8, 0}, { 1, -3, -5, 45, 78, 20, -8, 0}, { 1, -2, -7, 41, 79, 24, -8, 0}, - { 1, -2, -7, 37, 80, 28, -8, -1}, - {-1, -7, 32, 80, 32, -7, -1, 0}, - {-1, -8, 28, 80, 37, -7, -2, 1}, - { 0, -8, 24, 79, 41, -7, -2, 1}, - { 0, -8, 20, 78, 45, -5, -3, 1}, - { 0, -8, 16, 76, 50, -4, -3, 1}, - { 0, -7, 13, 74, 54, -3, -4, 1}, - { 1, -7, 9, 71, 58, -1, -4, 1}, - { 1, -6, 6, 68, 62, 1, -5, 1}, - { 1, -6, 4, 65, 65, 4, -6, 1}, - { 1, -5, 1, 62, 68, 6, -6, 1}, - { 1, -4, -1, 58, 71, 9, -7, 1}, - { 1, -4, -3, 54, 74, 13, -7, 0}, - { 1, -3, -4, 50, 76, 16, -8, 0}, - { 1, -3, -5, 45, 78, 20, -8, 0}, - { 1, -2, -7, 41, 79, 24, -8, 0}, { 1, -2, -7, 37, 80, 28, -8, -1} }; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS*2][8]) +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = { {0, 0, 0, 128, 0, 0, 0, 0}, {0, 1, -5, 125, 8, -2, 1, 0}, @@ -230,21 +165,5 @@ DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS*2][8]) {0, 2, -10, 37, 110, -14, 3, 0}, {0, 2, -8, 27, 116, -11, 2, 0}, {0, 1, -5, 17, 122, -8, 1, 0}, - {0, 1, -2, 8, 125, -5, 1, 0}, - {0, 0, 0, 128, 0, 0, 0, 0}, - {0, 1, -5, 125, 8, -2, 1, 0}, - {0, 1, -8, 122, 17, -5, 1, 0}, - {0, 2, -11, 116, 27, -8, 2, 0}, - {0, 3, -14, 110, 37, -10, 2, 0}, - {0, 3, -15, 103, 47, -12, 2, 0}, - {0, 3, -16, 95, 57, -14, 3, 0}, - {0, 3, -16, 86, 67, -15, 3, 0}, - {0, 3, -16, 77, 77, -16, 3, 0}, - {0, 3, -15, 67, 86, -16, 3, 0}, - {0, 3, -14, 57, 95, -16, 3, 0}, - {0, 2, -12, 47, 103, -15, 3, 0}, - {0, 2, -10, 37, 110, -14, 3, 0}, - {0, 2, -8, 27, 116, -11, 2, 0}, - {0, 1, -5, 17, 122, -8, 1, 0}, {0, 1, -2, 8, 125, -5, 1, 0} }; diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index 765379d35..1ccfdaac2 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -21,11 +21,11 @@ #define SUBPEL_SHIFTS 16 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS*2][8]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS*2][8]; -extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS*2][8]; -extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS*2][8]; -extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS*2][8]; +extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; // The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear // filter kernel as a 2 tap filter. diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 7be0101ac..30e8951af 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -128,6 +128,7 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, break; #endif } + assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); } void vp9_copy_mem16x16_c(const uint8_t *src, -- 2.40.0