From: Yi Luo <luoyi@google.com>
Date: Fri, 3 Jun 2016 01:39:41 +0000 (-0700)
Subject: Add horiz_filter.c
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=31a5738107c193d3d9e8ebaaeea9d4ff13975c33;p=libvpx

Add horiz_filter.c

- Saturation correct solution 3
---

diff --git a/convolve/Makefile b/convolve/Makefile
index 5d53c0d23..d86bb7d29 100644
--- a/convolve/Makefile
+++ b/convolve/Makefile
@@ -1,9 +1,11 @@
-all : filtering
+all : filtering horiz_filter
 
 CFLAGS += -mssse3
 
 filtering : filtering.c
+horiz_filter : horiz_filter.c
 
 .PHONY : clean
 clean :
-	$(RM) filtering
+	$(RM) filtering horiz_filter
+
diff --git a/convolve/horiz_filter.c b/convolve/horiz_filter.c
new file mode 100644
index 000000000..7c5f503fb
--- /dev/null
+++ b/convolve/horiz_filter.c
@@ -0,0 +1,304 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+static inline unsigned int readtsc(void) {
+  unsigned int tsc;
+  __asm__ __volatile__("rdtsc\n\t":"=a"(tsc):);
+  return tsc;
+}
+
+#define FILTER_BITS 7
+
+unsigned int seed = 0xbeef;
+
+int round_power_of_two(int x, int n) {
+  int ret = (x + (1 << (n - 1))) >> n;
+  return ret;
+}
+
+uint8_t inline clip_pixel(int x) {
+  uint8_t ret = x;
+  if (x < 0) {
+    ret = 0;
+  }
+  if (x > 255) {
+    ret = 255;
+  }
+  return ret;
+}
+
+static int filtering(const uint8_t *src, const int16_t *filter, int flen) {
+  int k;
+  int sum = 0;
+  int prod;
+  for (k = 0; k < flen; ++k) {
+    prod = src[k] * filter[k];
+    sum += prod;
+  }
+  return sum;
+}
+
+void convolve(const uint8_t *src, int w, const int16_t *filter, int flen,
+                uint8_t *buffer) {
+  int i;
+  int sum;
+
+  for (i = 0; i < w; ++i) {
+    sum = filtering(src, filter, flen);
+    buffer[i] = clip_pixel(round_power_of_two(sum, FILTER_BITS));
+    src += 1;
+  }
+}
+
+void init_state(uint8_t *buf, uint8_t *pixel, int w, int block) {
+  int i;
+
+  memset(buf, 0, sizeof(buf[0]) * block);
+  memset(pixel, 0, sizeof(pixel[0]) * block);
+
+  seed = 0xbeef;
+  for (i = 0; i < w; ++i) {
+    pixel[i] = clip_pixel(rand_r(&seed) % 255);
+  }
+}
+
+void check_buffer(const uint8_t *buf1, const uint8_t *buf2, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    if (buf1[i] != buf2[i]) {
+      printf("Not bit-exact on index %d\n", i);
+      printf("Expected: 0x%x, Actual: 0x%x\n", buf1[i], buf2[i]);
+      return;
+    }
+  }
+}
+
+static const int16_t filter12[12] = {
+  -1,   3,  -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1};
+
+static const int16_t filter10[10] = {
+  1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1};
+
+// SSSE3
+
+const int8_t pfilter12[3][16] __attribute__ ((aligned(16))) = {
+  {-1,  3, -4,  8, -18, 120,  28, -12,   7,  -4,   2,  -1,  0,  0,  0,  0},
+  //{ 0, -1,  3, -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1,  0,  0,  0},
+  { 0,  0, -1,  3,  -4,   8, -18, 120,  28, -12,   7,  -4,  2, -1,  0,  0},
+  //{ 0,  0,  0, -1,   3,  -4,   8, -18, 120,  28, -12,   7, -4,  2, -1,  0},
+  { 0,  0,  0,  0,  -1,   3,  -4,   8, -18, 120,  28, -12,  7, -4,  2, -1},
+};
+
+const int8_t pfilter10[3][16] __attribute__ ((aligned(16))) = {
+  //{1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0, 0, 0, 0},
+  {0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0, 0, 0},
+  //{0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0, 0},
+  {0, 0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0, 0},
+  //{0, 0, 0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0, 0},
+  {0, 0, 0, 0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1, 0},
+  //{0, 0, 0, 0, 0, 0, 1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1},
+};
+
+struct Filter {
+  const int8_t (*coeffs)[16];
+  int tapsNum;
+  int signalSpan;
+};
+
+const struct Filter pfilter_12tap = {
+  pfilter12, 12, 5
+};
+
+const struct Filter pfilter_10tap = {
+  pfilter10, 10, 7
+};
+
+void inline transpose_4x8(const __m128i *in, __m128i *out) {
+  __m128i t0, t1;
+
+  t0 = _mm_unpacklo_epi16(in[0], in[1]);
+  t1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  out[0] = _mm_unpacklo_epi32(t0, t1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(t0, t1);
+  out[3] = _mm_srli_si128(out[2], 8);
+
+  t0 = _mm_unpackhi_epi16(in[0], in[1]);
+  t1 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  out[4] = _mm_unpacklo_epi32(t0, t1);
+  out[5] = _mm_srli_si128(out[4], 8);
+  out[6] = _mm_unpackhi_epi32(t0, t1);  // 0?
+  out[7] = _mm_srli_si128(out[6], 8);   // 0?
+}
+
+void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
+                    uint8_t *buffer) {
+  __m128i sr[4];
+  __m128i sc[8];
+  __m128i pixel;
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+
+  pixel = _mm_loadu_si128((__m128i const *)src);
+  sr[0] = _mm_maddubs_epi16(pixel, f[0]);
+  sr[2] = _mm_maddubs_epi16(pixel, f[1]);
+
+  pixel = _mm_loadu_si128((__m128i const *)(src + 1));
+  sr[1] = _mm_maddubs_epi16(pixel, f[0]);
+  sr[3] = _mm_maddubs_epi16(pixel, f[1]);
+
+  transpose_4x8(sr, sc);
+
+  sr[0] = _mm_adds_epi16(sc[0], sc[1]);
+  sr[0] = _mm_adds_epi16(sr[0], sc[2]);
+
+  sr[1] = _mm_adds_epi16(sc[7], sc[6]);
+  sr[1] = _mm_adds_epi16(sr[1], sc[5]);
+
+  sr[2] = _mm_min_epi16(sc[3], sc[4]);
+  sr[3] = _mm_max_epi16(sc[3], sc[4]);
+
+  sr[0] = _mm_adds_epi16(sr[0], sr[1]);
+  sr[0] = _mm_adds_epi16(sr[0], sr[2]);
+  sr[0] = _mm_adds_epi16(sr[0], sr[3]);
+
+  sr[1] = _mm_mulhrs_epi16(sr[0], k_256);
+  sr[2] = _mm_packus_epi16(sr[1], sr[1]);
+
+  *(int *)buffer = _mm_cvtsi128_si32(sr[2]);
+}
+
+void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
+horiz_w4_ssse3(src, f, buf);
+src += 4;
+buf += 4;
+horiz_w4_ssse3(src, f, buf);
+}
+
+void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
+horiz_w8_ssse3(src, f, buf);
+src += 8;
+buf += 8;
+horiz_w8_ssse3(src, f, buf);
+}
+
+void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
+horiz_w16_ssse3(src, f, buf);
+src += 16;
+buf += 16;
+horiz_w16_ssse3(src, f, buf);
+}
+
+void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, uint8_t *buf) {
+horiz_w32_ssse3(src, f, buf);
+src += 32;
+buf += 32;
+horiz_w32_ssse3(src, f, buf);
+}
+
+void (*horizTab[5])(const uint8_t *, const __m128i *, uint8_t *) = {
+   horiz_w4_ssse3,
+   horiz_w8_ssse3,
+   horiz_w16_ssse3,
+   horiz_w32_ssse3,
+   horiz_w64_ssse3,
+};
+
+void horiz_filter_ssse3(const uint8_t *src, const struct Filter fData,
+                        int width, uint8_t *buffer) {
+  const int16_t *filter = (const int16_t *) fData.coeffs;
+  __m128i f[7];
+
+  if (fData.tapsNum == 12) {
+    f[0] = *((__m128i *)(fData.coeffs));
+    f[1] = *((__m128i *)(fData.coeffs + 1));
+    f[2] = *((__m128i *)(fData.coeffs + 2));
+    f[3] = *((__m128i *)(fData.coeffs + 3));
+  } else {
+    f[0] = *((__m128i *)(fData.coeffs));
+    f[1] = *((__m128i *)(fData.coeffs + 1));
+    f[2] = *((__m128i *)(fData.coeffs + 2));
+    f[3] = *((__m128i *)(fData.coeffs + 3));
+    f[4] = *((__m128i *)(fData.coeffs + 4));
+    f[5] = *((__m128i *)(fData.coeffs + 5));
+    f[6] = *((__m128i *)(fData.coeffs + 6));
+  }
+
+  switch (width) {
+    case 4:
+      horizTab[0](src, f, buffer);
+      break;
+    case 8:
+      horizTab[1](src, f, buffer);
+      break;
+    case 16:
+      horizTab[2](src, f, buffer);
+      break;
+    case 32:
+      horizTab[3](src, f, buffer);
+      break;
+    case 64:
+      horizTab[4](src, f, buffer);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+
+#define TEST_NUM (32)
+
+int main(int argc, char **argv)
+{
+  const size_t block_size = 256;
+
+  if (argc != 2) {
+    printf("Usage: filtering <width>, where width = 4, 8, 16, 32, 64\n");
+    return -1;
+  }
+
+  const int width = atoi(argv[1]);
+
+  uint8_t *buffer = (uint8_t *) malloc(2 * sizeof(buffer[0]) * block_size);
+  uint8_t *pixel = (uint8_t *) malloc(2 * sizeof(pixel[0]) * block_size);
+  uint8_t *ppixel = pixel + block_size;
+  uint8_t *pbuffer = buffer + block_size;
+
+  uint32_t start, end;
+  int count;
+
+  init_state(buffer, pixel, width, block_size);
+  init_state(pbuffer, ppixel, width, block_size);
+
+  count = 0;
+  start = readtsc();
+  do {
+    convolve(pixel, width, filter12, 12, buffer);
+    count++;
+  } while (count < TEST_NUM);
+  end = readtsc();
+
+  printf("C version cycles: %d\n", end - start);
+
+  // Solution 1
+  count = 0;
+  start = readtsc();
+  do {
+    horiz_filter_ssse3(ppixel, pfilter_12tap, width, pbuffer);
+    count++;
+  } while (count < TEST_NUM);
+  end = readtsc();
+
+  printf("SIMD version cycles: %d\n", end - start);
+
+  check_buffer(buffer, pbuffer, width);
+
+  free(buffer);
+  free(pixel);
+  return 0;
+}