vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
- for (int i = 0; i < 10000000 / (width * height); ++i) {
+ for (int i = 0; i < 100000000 / (width * height); ++i) {
avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
(uint8_t *)pred.TopLeftPixel(), width, height,
(uint8_t *)ref.TopLeftPixel(), ref.stride());
C, AvgPredTestHBD,
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif  // HAVE_SSE2
+
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#undef FNS
#undef FN
+
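+// Average each pixel of pred and ref (rounded, via _mm_avg_epu16) into
+// comp_pred. comp_pred and pred are packed (stride == width); ref uses
+// ref_stride.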
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+                                    int width, int height, const uint16_t *ref,
+                                    int ref_stride) {
+  int i, j;
+  if (width > 8) {
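+    // The inner loop assumes width is a multiple of 16 and averages two
+    // 8-lane registers (16 pixels) per iteration.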
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+        const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+        const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+        const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+        _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+        _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
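+    // 8-wide blocks: process two rows per iteration; pred rows are
+    // contiguous, so pred[8] is the start of the next row.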
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+      const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+      _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+      comp_pred += 8 << 1;
+      pred += 8 << 1;
+      ref += ref_stride << 1;
+    }
+  } else {
+    assert(width == 4);
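+    // 4-wide blocks: use 64-bit loads/stores and handle two rows per
+    // iteration.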
+    for (i = 0; i < height; i += 2) {
+      const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+      const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+      const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+      const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+      _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+      _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+      comp_pred += 4 << 1;
+      pred += 4 << 1;
+      ref += ref_stride << 1;
+    }
+  }
+}