/*
 * horiz_filter.c - compares a scalar C horizontal convolution against an
 * SSSE3 implementation, printing the TSC delta of each and checking that
 * both produce the same output bytes.
 *
 * Origin of this file (reconstructed from a whitespace-collapsed patch):
 *
 *   From: Yi Luo
 *   Date: Fri, 3 Jun 2016 01:39:41 +0000 (-0700)
 *   Subject: Add horiz_filter.c
 *   X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=31a5738107c193d3d9e8ebaaeea9d4ff13975c33;p=libvpx
 *
 *   Add horiz_filter.c
 *
 *   - Saturation correct solution 3
 *
 * The same commit changed convolve/Makefile as follows:
 *
 *   diff --git a/convolve/Makefile b/convolve/Makefile
 *   index 5d53c0d23..d86bb7d29 100644
 *   --- a/convolve/Makefile
 *   +++ b/convolve/Makefile
 *   @@ -1,9 +1,11 @@
 *   -all : filtering
 *   +all : filtering horiz_filter
 *
 *    CFLAGS += -mssse3
 *
 *    filtering : filtering.c
 *   +horiz_filter : horiz_filter.c
 *
 *    .PHONY : clean
 *    clean :
 *   -	$(RM) filtering
 *   +	$(RM) filtering horiz_filter
 *   +
 *
 * NOTE(review): the header names of the six #include directives were lost in
 * the collapsed patch; the list below is inferred from the identifiers the
 * code uses (assert, uint8_t, printf, malloc/strtol, memset/memcpy and the
 * SSSE3 intrinsics).
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>

/*
 * Marks functions that use SSE2/SSSE3 intrinsics so the file also builds when
 * -mssse3 is not on the command line. Redundant (but harmless) with the
 * Makefile's CFLAGS.
 */
#define SSSE3_FN __attribute__((target("ssse3")))

/*
 * Returns the low 32 bits of the x86 time-stamp counter.
 *
 * RDTSC writes EDX:EAX, so EDX must be declared as an output as well;
 * otherwise the compiler may assume EDX is preserved across the instruction.
 */
static inline unsigned int readtsc(void) {
  unsigned int lo;
  unsigned int hi;
  __asm__ __volatile__("rdtsc\n\t" : "=a"(lo), "=d"(hi));
  (void)hi;
  return lo;
}

#define FILTER_BITS 7

/* State of the pseudo-random generator; reset by init_state(). */
unsigned int seed = 0xbeef;

/*
 * Minimal linear congruential generator returning a value in [0, 32767].
 * Used instead of rand_r(), which is unavailable under strict ISO C and
 * whose sequence differs between C libraries.
 */
static unsigned int next_rand(unsigned int *state) {
  *state = *state * 1103515245u + 12345u;
  return (*state >> 16) & 0x7fffu;
}

/* Returns x / 2^n rounded to nearest (adds half the divisor, then shifts). */
int round_power_of_two(int x, int n) {
  return (x + (1 << (n - 1))) >> n;
}

/*
 * Clamps x to [0, 255].
 *
 * Declared static inline: a plain `inline` definition emits no external
 * symbol in C99/C11, which breaks the link whenever the call is not inlined.
 */
static inline uint8_t clip_pixel(int x) {
  if (x < 0) {
    return 0;
  }
  if (x > 255) {
    return 255;
  }
  return (uint8_t)x;
}

/* Returns the dot product of src[0..flen-1] and filter[0..flen-1]. */
static int filtering(const uint8_t *src, const int16_t *filter, int flen) {
  int k;
  int sum = 0;
  for (k = 0; k < flen; ++k) {
    sum += src[k] * filter[k];
  }
  return sum;
}

/*
 * Reference convolution: for each i in [0, w), writes to buffer[i] the
 * rounded (by FILTER_BITS) and clipped dot product of src[i..i+flen-1] with
 * filter. Reads src[0 .. w + flen - 2].
 */
void convolve(const uint8_t *src, int w, const int16_t *filter, int flen,
              uint8_t *buffer) {
  int i;
  for (i = 0; i < w; ++i) {
    const int sum = filtering(src + i, filter, flen);
    buffer[i] = clip_pixel(round_power_of_two(sum, FILTER_BITS));
  }
}

/*
 * Zeroes `block` bytes of buf and pixel, resets the generator and fills
 * pixel[0..w-1] with pseudo-random values in [0, 254] (the modulus is 255).
 * The caller must guarantee w <= block.
 */
void init_state(uint8_t *buf, uint8_t *pixel, int w, int block) {
  int i;

  memset(buf, 0, sizeof(buf[0]) * (size_t)block);
  memset(pixel, 0, sizeof(pixel[0]) * (size_t)block);

  seed = 0xbeef;
  for (i = 0; i < w; ++i) {
    pixel[i] = clip_pixel((int)(next_rand(&seed) % 255));
  }
}

/*
 * Compares buf1 (expected) and buf2 (actual) over `width` bytes and prints
 * the first difference, if any.
 *
 * Returns 0 when the buffers match and -1 at the first mismatch.
 */
int check_buffer(const uint8_t *buf1, const uint8_t *buf2, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    if (buf1[i] != buf2[i]) {
      printf("Not bit-exact on index %d\n", i);
      printf("Expected: 0x%x, Actual: 0x%x\n", buf1[i], buf2[i]);
      return -1;
    }
  }
  return 0;
}

static const int16_t filter12[12] = {
  -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1};

static const int16_t filter10[10] = {
  1, -3, 7, -17, 119, 28, -11, 5, -2, 1};

// SSSE3

/* Number of rows actually present in each pfilter table below. */
enum { PFILTER_ROWS = 3 };

/*
 * 12-tap coefficients as signed bytes, one row per even pixel shift
 * (0, 2, 4). The odd shifts are left commented out, as in the original.
 */
const int8_t pfilter12[PFILTER_ROWS][16] __attribute__ ((aligned(16))) = {
  {-1,  3, -4,  8, -18, 120,  28, -12,   7,  -4,   2,  -1,  0,  0,  0,  0},
  //{ 0, -1,  3, -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1,  0,  0,  0},
  { 0,  0, -1,  3,  -4,   8, -18, 120,  28, -12,   7,  -4,  2, -1,  0,  0},
  //{ 0,  0,  0, -1,   3,  -4,   8, -18, 120,  28, -12,   7, -4,  2, -1,  0},
  { 0,  0,  0,  0,  -1,   3,  -4,   8, -18, 120,  28, -12,  7, -4,  2, -1},
};

/*
 * 10-tap coefficients as signed bytes, one row per odd pixel shift
 * (1, 3, 5). The even shifts are left commented out, as in the original.
 */
const int8_t pfilter10[PFILTER_ROWS][16] __attribute__ ((aligned(16))) = {
  //{1, -3,  7, -17, 119,  28, -11,   5,  -2,   1,  0,  0,  0,  0,  0,  0},
  {0,  1, -3,   7, -17, 119,  28, -11,   5,  -2,  1,  0,  0,  0,  0,  0},
  //{0,  0,  1,  -3,   7, -17, 119,  28, -11,   5, -2,  1,  0,  0,  0,  0},
  {0,  0,  0,   1,  -3,   7, -17, 119,  28, -11,  5, -2,  1,  0,  0,  0},
  //{0,  0,  0,   0,   1,  -3,   7, -17, 119,  28, -11, 5, -2,  1,  0,  0},
  {0,  0,  0,   0,   0,   1,  -3,   7, -17, 119, 28, -11, 5, -2,  1,  0},
  //{0,  0,  0,   0,   0,   0,   1,  -3,   7, -17, 119, 28, -11, 5, -2, 1},
};

/* Describes one SIMD coefficient table. */
struct Filter {
  const int8_t (*coeffs)[16];  /* PFILTER_ROWS rows of 16 signed bytes */
  int tapsNum;                 /* number of non-zero taps in each row */
  int signalSpan;              /* NOTE(review): not read anywhere in this
                                  file; presumably the number of shifted
                                  rows a complete table would hold (5 and 7
                                  match the commented-out rows) - confirm */
};

const struct Filter pfilter_12tap = {
  pfilter12, 12, 5
};

const struct Filter pfilter_10tap = {
  pfilter10, 10, 7
};

/*
 * Transposes four rows of eight 16-bit lanes. After the call the low four
 * lanes of out[j] hold lane j of in[0], in[1], in[2], in[3] (in that order).
 * The upper four lanes of each out[j] are leftovers from the unpack/shift
 * sequence and are not meaningful.
 */
static inline SSSE3_FN void transpose_4x8(const __m128i *in, __m128i *out) {
  __m128i t0, t1;

  t0 = _mm_unpacklo_epi16(in[0], in[1]);
  t1 = _mm_unpacklo_epi16(in[2], in[3]);

  out[0] = _mm_unpacklo_epi32(t0, t1);
  out[1] = _mm_srli_si128(out[0], 8);
  out[2] = _mm_unpackhi_epi32(t0, t1);
  out[3] = _mm_srli_si128(out[2], 8);

  t0 = _mm_unpackhi_epi16(in[0], in[1]);
  t1 = _mm_unpackhi_epi16(in[2], in[3]);

  out[4] = _mm_unpacklo_epi32(t0, t1);
  out[5] = _mm_srli_si128(out[4], 8);
  out[6] = _mm_unpackhi_epi32(t0, t1);  // 0?
  out[7] = _mm_srli_si128(out[6], 8);   // 0?
}

/*
 * Filters four output pixels and stores them to buffer[0..3].
 *
 * Uses only f[0] and f[1]. Loads 16 bytes at src and 16 bytes at src + 1,
 * so src[0..16] must be readable. Outputs 0 and 1 come from f[0] applied to
 * src and src + 1; outputs 2 and 3 come from f[1] applied the same way.
 *
 * The eight pair-products per output are summed with saturating adds in a
 * fixed order: lanes 0-2, then lanes 7-5, then min and max of lanes 3 and 4
 * last. The sum is then rounded by FILTER_BITS (mulhrs by 1 << 8 computes
 * (x + 64) >> 7) and packed to unsigned bytes.
 *
 * NOTE(review): with the shift-0 row of pfilter12 the (-18, 120) pair sits in
 * lane 2, inside the first group, whose exact value can reach
 * 255 * (3 + 8 + 120) = 33405 > INT16_MAX. That group can saturate before
 * the negative lanes are added, so outputs 0 and 1 may differ from
 * convolve() on extreme inputs - confirm against the reference.
 * NOTE(review): pfilter10's rows start at shift 1, so using this kernel with
 * the 10-tap table looks unfinished; main() only exercises the 12-tap table.
 */
SSSE3_FN void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
                             uint8_t *buffer) {
  __m128i sr[4];
  __m128i sc[8];
  __m128i pixel;
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  int packed;

  pixel = _mm_loadu_si128((__m128i const *)src);
  sr[0] = _mm_maddubs_epi16(pixel, f[0]);
  sr[2] = _mm_maddubs_epi16(pixel, f[1]);

  pixel = _mm_loadu_si128((__m128i const *)(src + 1));
  sr[1] = _mm_maddubs_epi16(pixel, f[0]);
  sr[3] = _mm_maddubs_epi16(pixel, f[1]);

  transpose_4x8(sr, sc);

  sr[0] = _mm_adds_epi16(sc[0], sc[1]);
  sr[0] = _mm_adds_epi16(sr[0], sc[2]);

  sr[1] = _mm_adds_epi16(sc[7], sc[6]);
  sr[1] = _mm_adds_epi16(sr[1], sc[5]);

  sr[2] = _mm_min_epi16(sc[3], sc[4]);
  sr[3] = _mm_max_epi16(sc[3], sc[4]);

  sr[0] = _mm_adds_epi16(sr[0], sr[1]);
  sr[0] = _mm_adds_epi16(sr[0], sr[2]);
  sr[0] = _mm_adds_epi16(sr[0], sr[3]);

  sr[1] = _mm_mulhrs_epi16(sr[0], k_256);
  sr[2] = _mm_packus_epi16(sr[1], sr[1]);

  /* memcpy instead of *(int *)buffer: buffer has no alignment guarantee and
     is a uint8_t array, so the pointer cast was a type-punned store. */
  packed = _mm_cvtsi128_si32(sr[2]);
  memcpy(buffer, &packed, sizeof(packed));
}

/* Filters 8 output pixels as two consecutive 4-pixel groups. */
void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
  horiz_w4_ssse3(src, f, buf);
  horiz_w4_ssse3(src + 4, f, buf + 4);
}

/* Filters 16 output pixels as two consecutive 8-pixel groups. */
void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
  horiz_w8_ssse3(src, f, buf);
  horiz_w8_ssse3(src + 8, f, buf + 8);
}

/* Filters 32 output pixels as two consecutive 16-pixel groups. */
void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
  horiz_w16_ssse3(src, f, buf);
  horiz_w16_ssse3(src + 16, f, buf + 16);
}

/* Filters 64 output pixels as two consecutive 32-pixel groups. */
void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
  horiz_w32_ssse3(src, f, buf);
  horiz_w32_ssse3(src + 32, f, buf + 32);
}

/* Kernels indexed by width: 4, 8, 16, 32, 64. */
void (*horizTab[5])(const uint8_t *, const __m128i *, uint8_t *) = {
  horiz_w4_ssse3,
  horiz_w8_ssse3,
  horiz_w16_ssse3,
  horiz_w32_ssse3,
  horiz_w64_ssse3,
};

/* Returns the horizTab index for a supported width, or -1 otherwise. */
static int width_index(long width) {
  static const int widths[5] = {4, 8, 16, 32, 64};
  int i;
  for (i = 0; i < 5; ++i) {
    if (width == widths[i]) {
      return i;
    }
  }
  return -1;
}

/*
 * Runs the SSSE3 kernel for `width` output pixels (4, 8, 16, 32 or 64).
 *
 * Loads exactly the PFILTER_ROWS rows that the coefficient tables contain.
 * The original loaded 4 rows (12-tap) or 7 rows (10-tap) from 3-row tables,
 * reading past the end of the array. fData.coeffs must be 16-byte aligned.
 * An unsupported width trips the assert (and is a no-op under NDEBUG).
 */
SSSE3_FN void horiz_filter_ssse3(const uint8_t *src, const struct Filter fData,
                                 int width, uint8_t *buffer) {
  __m128i f[PFILTER_ROWS];
  const int idx = width_index(width);
  int i;

  for (i = 0; i < PFILTER_ROWS; ++i) {
    f[i] = _mm_load_si128((const __m128i *)fData.coeffs[i]);
  }

  if (idx < 0) {
    assert(0);
    return;
  }
  horizTab[idx](src, f, buffer);
}


#define TEST_NUM (32)

/* Prints the command-line synopsis. */
static void usage(const char *prog) {
  printf("Usage: %s <width>, where width = 4, 8, 16, 32, 64\n", prog);
}

/*
 * Times TEST_NUM runs of the C and SSSE3 filters on identical input and
 * checks that the outputs match.
 *
 * Returns 0 when the outputs are bit-exact, 1 when they differ, and -1 on a
 * usage error or allocation failure.
 */
int main(int argc, char **argv)
{
  const size_t block_size = 256;
  const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "horiz_filter";

  if (argc != 2) {
    usage(prog);
    return -1;
  }

  /* Validate before any buffer is touched: the buffers are block_size bytes
     and every routine below trusts the width. */
  char *end = NULL;
  const long parsed = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || width_index(parsed) < 0) {
    usage(prog);
    return -1;
  }
  const int width = (int)parsed;

  /* Each allocation holds two blocks: [0] for the C path, [1] for SIMD. */
  uint8_t *buffer = malloc(2 * block_size * sizeof(*buffer));
  uint8_t *pixel = malloc(2 * block_size * sizeof(*pixel));
  if (buffer == NULL || pixel == NULL) {
    printf("Out of memory\n");
    free(buffer);
    free(pixel);
    return -1;
  }
  uint8_t *ppixel = pixel + block_size;
  uint8_t *pbuffer = buffer + block_size;

  uint32_t start, end_tsc;
  int count;

  init_state(buffer, pixel, width, (int)block_size);
  init_state(pbuffer, ppixel, width, (int)block_size);

  start = readtsc();
  for (count = 0; count < TEST_NUM; ++count) {
    convolve(pixel, width, filter12, 12, buffer);
  }
  end_tsc = readtsc();

  printf("C version cycles: %u\n", (unsigned int)(end_tsc - start));

  // SSSE3 version
  start = readtsc();
  for (count = 0; count < TEST_NUM; ++count) {
    horiz_filter_ssse3(ppixel, pfilter_12tap, width, pbuffer);
  }
  end_tsc = readtsc();

  printf("SIMD version cycles: %u\n", (unsigned int)(end_tsc - start));

  const int status = (check_buffer(buffer, pbuffer, width) == 0) ? 0 : 1;

  free(buffer);
  free(pixel);
  return status;
}