From: Jim Bankoski <jimbankoski@google.com>
Date: Tue, 14 Jul 2015 16:19:01 +0000 (-0700)
Subject: Fill buffer speed up
X-Git-Tag: v1.5.0~412
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0fe589f21ea75db7fbee2a84749d24b5666ad232;p=libvpx

Fill buffer speed up

Eliminates the byte-by-byte read from the bool decoder by reading
in a size_t and then shifting it into place.

Change-Id: I0ed8c7b6f942847e79cc90105dc1d2b5b3deb0d6
---

diff --git a/vpx_dsp/bitreader.c b/vpx_dsp/bitreader.c
index 3eae922ca..4420fadee 100644
--- a/vpx_dsp/bitreader.c
+++ b/vpx_dsp/bitreader.c
@@ -7,11 +7,15 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "./bitreader.h"
-#include "./prob.h"
+#include <stdlib.h>
 
+#include "./vpx_config.h"
+
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/prob.h"
 #include "vpx_ports/mem.h"
 #include "vpx_mem/vpx_mem.h"
+#include "vpx_util/endian_inl.h"
 
 int vpx_reader_init(vpx_reader *r,
                     const uint8_t *buffer,
@@ -39,11 +43,9 @@ void vpx_reader_fill(vpx_reader *r) {
   const uint8_t *buffer_start = buffer;
   BD_VALUE value = r->value;
   int count = r->count;
-  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
-  int loop_end = 0;
   const size_t bytes_left = buffer_end - buffer;
   const size_t bits_left = bytes_left * CHAR_BIT;
-  const int x = (int)(shift + CHAR_BIT - bits_left);
+  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
 
   if (r->decrypt_cb) {
     size_t n = MIN(sizeof(r->clear_buffer), bytes_left);
@@ -51,17 +53,34 @@ void vpx_reader_fill(vpx_reader *r) {
     buffer = r->clear_buffer;
     buffer_start = r->clear_buffer;
   }
+  if (bits_left > BD_VALUE_SIZE) {
+      const int bits = (shift & 0xfffffff8) + CHAR_BIT;
+      BD_VALUE nv;
+      BD_VALUE big_endian_values;
+      memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
+#if SIZE_MAX == 0xffffffffffffffffULL
+        big_endian_values = HToBE64(big_endian_values);
+#else
+        big_endian_values = HToBE32(big_endian_values);
+#endif
+      nv = big_endian_values >> (BD_VALUE_SIZE - bits);
+      count += bits;
+      buffer += (bits >> 3);
+      value = r->value | (nv << (shift & 0x7));
+  } else {
+    const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+    int loop_end = 0;
+    if (bits_over >= 0) {
+      count += LOTS_OF_BITS;
+      loop_end = bits_over;
+    }
 
-  if (x >= 0) {
-    count += LOTS_OF_BITS;
-    loop_end = x;
-  }
-
-  if (x < 0 || bits_left) {
-    while (shift >= loop_end) {
-      count += CHAR_BIT;
-      value |= (BD_VALUE)*buffer++ << shift;
-      shift -= CHAR_BIT;
+    if (bits_over < 0 || bits_left) {
+      while (shift >= loop_end) {
+        count += CHAR_BIT;
+        value |= (BD_VALUE)*buffer++ << shift;
+        shift -= CHAR_BIT;
+      }
     }
   }
 
diff --git a/vpx_util/endian_inl.h b/vpx_util/endian_inl.h
new file mode 100644
index 000000000..12cc720a4
--- /dev/null
+++ b/vpx_util/endian_inl.h
@@ -0,0 +1,117 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Endian related functions.
+
+#ifndef VPX_UTIL_ENDIAN_INL_H_
+#define VPX_UTIL_ENDIAN_INL_H_
+
+#include <stdlib.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if defined(__GNUC__)
+# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
+# define LOCAL_GCC_PREREQ(maj, min) \
+    (LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_GCC_VERSION 0
+# define LOCAL_GCC_PREREQ(maj, min) 0
+#endif
+
+#ifdef __clang__
+# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
+# define LOCAL_CLANG_PREREQ(maj, min) \
+    (LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
+#else
+# define LOCAL_CLANG_VERSION 0
+# define LOCAL_CLANG_PREREQ(maj, min) 0
+#endif  // __clang__
+
+// Detect big-endian targets (e.g. mips-gcc doesn't define __BIG_ENDIAN__).
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
+#if defined(WORDS_BIGENDIAN)
+#define HToLE32 BSwap32
+#define HToLE16 BSwap16
+#define HToBE64(x) (x)
+#define HToBE32(x) (x)
+#else
+#define HToLE32(x) (x)
+#define HToLE16(x) (x)
+#define HToBE64(X) BSwap64(X)
+#define HToBE32(X) BSwap32(X)
+#endif
+
+// clang-3.3 and gcc-4.3 have builtin functions for swap32/swap64
+#if LOCAL_GCC_PREREQ(4, 3) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP32
+#define HAVE_BUILTIN_BSWAP64
+#endif
+// clang-3.3 and gcc-4.8 have a builtin function for swap16
+#if LOCAL_GCC_PREREQ(4, 8) || LOCAL_CLANG_PREREQ(3, 3)
+#define HAVE_BUILTIN_BSWAP16
+#endif
+
+static INLINE uint16_t BSwap16(uint16_t x) {  // Swaps the two bytes of x.
+#if defined(HAVE_BUILTIN_BSWAP16)
+  return __builtin_bswap16(x);  // builtin: gcc >= 4.8 or clang >= 3.3
+#elif defined(_MSC_VER)
+  return _byteswap_ushort(x);  // MSVC-provided byte swap
+#else
+  // Portable fallback; gcc will recognize a 'rorw $8, ...' here:
+  return (x >> 8) | ((x & 0xff) << 8);
+#endif  // HAVE_BUILTIN_BSWAP16
+}
+
+static INLINE uint32_t BSwap32(uint32_t x) {  // Reverses the four bytes of x.
+#if HAVE_MIPS32  // taken from the build configuration (vpx_config.h)
+  uint32_t ret;  // receives the result of the wsbh + rotr-by-16 sequence
+  __asm__ volatile (
+    "wsbh   %[ret], %[x]          \n\t"
+    "rotr   %[ret], %[ret],  16   \n\t"
+    : [ret]"=r"(ret)
+    : [x]"r"(x)
+  );
+  return ret;
+#elif defined(HAVE_BUILTIN_BSWAP32)
+  return __builtin_bswap32(x);  // builtin: gcc >= 4.3 or clang >= 3.3
+#elif defined(__i386__) || defined(__x86_64__)
+  uint32_t swapped_bytes;  // the "0" constraint below ties x to this operand
+  __asm__ volatile("bswap %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint32_t)_byteswap_ulong(x);  // MSVC-provided byte swap
+#else  // portable shift-and-mask fallback
+  return (x >> 24) | ((x >> 8) & 0xff00) | ((x << 8) & 0xff0000) | (x << 24);
+#endif  // HAVE_BUILTIN_BSWAP32
+}
+
+static INLINE uint64_t BSwap64(uint64_t x) {  // Reverses the eight bytes of x.
+#if defined(HAVE_BUILTIN_BSWAP64)
+  return __builtin_bswap64(x);  // builtin: gcc >= 4.3 or clang >= 3.3
+#elif defined(__x86_64__)
+  uint64_t swapped_bytes;  // the "0" constraint below ties x to this operand
+  __asm__ volatile("bswapq %0" : "=r"(swapped_bytes) : "0"(x));
+  return swapped_bytes;
+#elif defined(_MSC_VER)
+  return (uint64_t)_byteswap_uint64(x);  // MSVC-provided byte swap
+#else  // generic: swap 32-bit halves, 16-bit pairs, bytes (suggested by bdb@)
+  x = ((x & 0xffffffff00000000ull) >> 32) | ((x & 0x00000000ffffffffull) << 32);
+  x = ((x & 0xffff0000ffff0000ull) >> 16) | ((x & 0x0000ffff0000ffffull) << 16);
+  x = ((x & 0xff00ff00ff00ff00ull) >>  8) | ((x & 0x00ff00ff00ff00ffull) <<  8);
+  return x;
+#endif  // HAVE_BUILTIN_BSWAP64
+}
+
+#endif  // VPX_UTIL_ENDIAN_INL_H_
diff --git a/vpx_util/vpx_util.mk b/vpx_util/vpx_util.mk
index 116112548..c0ef8d336 100644
--- a/vpx_util/vpx_util.mk
+++ b/vpx_util/vpx_util.mk
@@ -11,3 +11,4 @@
 UTIL_SRCS-yes += vpx_util.mk
 UTIL_SRCS-yes += vpx_thread.c
 UTIL_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += endian_inl.h