From: Johann Date: Wed, 24 Oct 2018 22:48:32 +0000 (-0700) Subject: vp8 bilinear: rewrite 16x16 X-Git-Tag: v1.8.0~205^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ad0ed535a702ebe4164c0bf5fd5f3211269f1fad;p=libvpx vp8 bilinear: rewrite 16x16 Marginally faster. Most importantly it drops a dependency on an external symbol (vp8_bilinear_filters_x86_8). Change-Id: Iff022e718720f1f0eeced6201a1ad69a9c9c4f45 --- diff --git a/vp8/common/x86/bilinear_filter_sse2.c b/vp8/common/x86/bilinear_filter_sse2.c index db91bdca4..224c1b32a 100644 --- a/vp8/common/x86/bilinear_filter_sse2.c +++ b/vp8/common/x86/bilinear_filter_sse2.c @@ -12,9 +12,133 @@ #include #include "./vp8_rtcd.h" +#include "./vpx_config.h" #include "vp8/common/filter.h" #include "vpx_ports/mem.h" +static INLINE void horizontal_16x16(uint8_t *src, const int stride, + uint16_t *dst, const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_lo); + _mm_store_si128((__m128i *)(dst + 8), a_hi); + src += stride; + dst += 16; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0); + const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0); + + const __m128i b = _mm_loadu_si128((__m128i *)(src + 1)); + const __m128i b_lo = _mm_unpacklo_epi8(b, zero); + const __m128i b_hi = _mm_unpackhi_epi8(b, zero); + const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1); + const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1); + + const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); + const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + _mm_store_si128((__m128i *)dst, shifted_lo); + _mm_store_si128((__m128i *)(dst + 8), shifted_hi); + src += stride; + dst += 16; + } + } +} + +static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 16; ++h) { + const __m128i row_lo = _mm_load_si128((__m128i *)src); + const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i packed = _mm_packus_epi16(row_lo, row_hi); + _mm_store_si128((__m128i *)dst, packed); + src += 16; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0_lo = _mm_load_si128((__m128i *)src); + __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8)); + src += 16; + for (h = 0; h < 16; ++h) { + const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0); + const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0); + + const __m128i row_1_lo = _mm_load_si128((__m128i *)src); + const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1); + const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1); + + const __m128i sum_lo = + _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered); + const __m128i sum_hi = + _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi); + _mm_store_si128((__m128i *)dst, packed); + row_0_lo = row_1_lo; + row_0_hi = row_1_hi; + src += 16; + dst += stride; + } + } +} + +void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + uint16_t FData[16 * 17]; + + assert((xoffset | yoffset) != 0); + + horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_16x16(FData, dst_ptr, dst_pitch, yoffset); +} + static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst, const int xoffset, const int height) { int h; diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index f1ec55b27..51c015e3d 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -10,7 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 @@ -958,274 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2): ret -;void vp8_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp8_bilinear_filters_x86_8) -global sym(vp8_bilinear_predict16x16_sse2) PRIVATE -sym(vp8_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - SECTION_RODATA align 16 rd: