From 44ee77790516969af6d2098dab0fd39c8e23dcb4 Mon Sep 17 00:00:00 2001 From: Frank Galligan Date: Fri, 12 Dec 2014 11:34:24 -0800 Subject: [PATCH] Remove redundant loads on 1d8_v8 filter. This CL showed a modest gain in performance on some systems. Change-Id: Iad636a89a1a9804ab7a0dea302bf2c6a4d1653a4 --- vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c index c4efa6565..bcae6c18b 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c @@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, unsigned int out_pitch, unsigned int output_height, int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; + __m128i addFilterReg64, filtersReg, minReg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); + srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]); + srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]); + srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]); + srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]); + srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]); + srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]); + for (i = 0; i < output_height; i++) { - // load the first 8 bytes - srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); - srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); - srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); - srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); @@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, src_ptr+=src_pitch; + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + // save only 8 bytes convolve result _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); -- 2.40.0