From 195883023bb39b5ee5c6811a316ab96d9225034d Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 2 Sep 2015 13:15:52 -0700 Subject: [PATCH] VPX: subpixel_8t_ssse3 asm using x86inc This is based on the original patch optimized for 32bit platforms by Tamar/Ilya and now uses the x86inc style asm. The assembly was also modified to support 64bit platforms. Change-Id: Ice12f249bbbc162a7427e3d23fbf0cbe4135aff2 --- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 133 -- vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm | 1628 ++++++++------------ 3 files changed, 614 insertions(+), 1148 deletions(-) diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 61670d922..98a3d3401 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -43,6 +43,7 @@ ifeq ($(CONFIG_USE_X86INC),yes) DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm endif # CONFIG_USE_X86INC ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 48817581d..8d5c7c2dd 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -203,125 +203,6 @@ void vpx_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr, } } -#if ARCH_X86_64 -static void vpx_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, - ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - 
filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); - - // filter the source buffer - srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - 
- // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes. - // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - src_ptr+=src_pixels_per_line; - - // save 16 bytes - 
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - - output_ptr+=output_pitch; - } -} -#endif // ARCH_X86_64 - void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, @@ -527,26 +408,12 @@ static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, } #endif // ARCH_X86_64 -#if ARCH_X86_64 -filter8_1dfunction vpx_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d16_v8_ssse3 vpx_filter_block1d16_v8_intrin_ssse3 -#define vpx_filter_block1d16_h8_ssse3 vpx_filter_block1d16_h8_intrin_ssse3 -#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 filter8_1dfunction vpx_filter_block1d16_v8_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_ssse3; filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_v8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#endif // ARCH_X86_64 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; diff --git a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index 68acc03ce..7ea6a0e58 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -1,5 +1,5 @@ ; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. 
; ; Use of this source code is governed by a BSD-style license ; that can be found in the LICENSE file in the root of the source @@ -8,1064 +8,662 @@ ; be found in the AUTHORS file in the root of the source tree. ; +%include "third_party/x86inc/x86inc.asm" -%include "vpx_ports/x86_abi_support.asm" - -%macro VERTx4 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.loop: - movd xmm0, [rsi] ;A - movd xmm1, [rsi + rdx] ;B - movd xmm2, [rsi + rdx * 2] ;C - movd xmm3, [rax + rdx * 2] ;D - movd xmm4, [rsi + rdx * 4] ;E - movd xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movd xmm6, [rsi + rbx] ;G - movd xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - movdqa xmm1, xmm2 - paddsw xmm0, xmm6 - pmaxsw xmm2, xmm4 - pminsw xmm4, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 +SECTION_RODATA +pw_64: times 8 dw 64 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. 
- add rsi, rdx - add rax, rdx -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 +SECTION .text +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE 16*4 +%else + %define LOCAL_VARS_SIZE 16*6 %endif - movd [rdi], xmm0 -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. + pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp m13 + mova krd, [GLOBAL(pw_64)] +%else + %define tmp [rsp + 16*4] + %define krd [rsp + 16*5] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] %else - add rdi, r8 + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 %endif - dec rcx - jnz .loop -%endm - -%macro VERTx8 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD 
PTR arg(3) ;out_pitch + mova krd, m6 %endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm6 - movdqa xmm1, xmm2 - pmaxsw xmm2, xmm4 - pminsw xmm4, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 +%endm - add rsi, rdx - add rax, rdx -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 +%macro HORIZx4_ROW 2 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova m3, %2 + palignr %2, %1, 1 + palignr m3, %1, 5 + + pmaddubsw %2, k0k1k4k5 + pmaddubsw m3, k2k3k6k7 + + mova m4, %2 + mova m5, m3 + psrldq %2, 8 + psrldq m3, 8 + mova m6, m5 + + paddsw m4, m3 + pmaxsw m5, %2 + pminsw %2, m6 + paddsw %2, m4 + paddsw %2, m5 + paddsw %2, krd + psraw %2, 7 + packuswb %2, %2 +%endm -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + %define orig_height r7 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 %else - add rdi, r8 + %define k0k1k4k5 [rsp + 16*0] 
+ %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + %define orig_height [rsp + 16*3] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 %endif - dec rcx - jnz .loop -%endm - - -%macro VERTx16 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 %endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - + mov orig_height, heightq + shr heightq, 1 .loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm6 - movdqa xmm1, xmm2 - pmaxsw xmm2, xmm4 - pminsw xmm4, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, 
xmm0 -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 + ;Do two rows at once + movh m0, [srcq - 3] + movh m1, [srcq + 5] + punpcklqdq m0, m1 + mova m1, m0 + movh m2, [srcq + sstrideq - 3] + movh m3, [srcq + sstrideq + 5] + punpcklqdq m2, m3 + mova m3, m2 + punpcklbw m0, m0 + punpckhbw m1, m1 + punpcklbw m2, m2 + punpckhbw m3, m3 + mova m4, m1 + palignr m4, m0, 1 + pmaddubsw m4, k0k1k4k5 + palignr m1, m0, 5 + pmaddubsw m1, k2k3k6k7 + mova m7, m3 + palignr m7, m2, 1 + pmaddubsw m7, k0k1k4k5 + palignr m3, m2, 5 + pmaddubsw m3, k2k3k6k7 + mova m0, m4 + mova m5, m1 + mova m2, m7 + psrldq m4, 8 + psrldq m1, 8 + mova m6, m5 + paddsw m0, m1 + mova m1, m3 + psrldq m7, 8 + psrldq m3, 8 + paddsw m2, m3 + mova m3, m1 + pmaxsw m5, m4 + pminsw m4, m6 + paddsw m4, m0 + paddsw m4, m5 + pmaxsw m1, m7 + pminsw m7, m3 + paddsw m7, m2 + paddsw m7, m1 + + paddsw m4, krd + psraw m4, 7 + packuswb m4, m4 + paddsw m7, krd + psraw m7, 7 + packuswb m7, m7 + +%ifidn %1, h8_avg + movd m0, [dstq] + pavgb m4, m0 + movd m2, [dstq + dstrideq] + pavgb m7, m2 %endif - movq [rdi], xmm0 - - movq xmm0, [rsi + 8] ;A - movq xmm1, [rsi + rdx + 8] ;B - movq xmm2, [rsi + rdx * 2 + 8] ;C - movq xmm3, [rax + rdx * 2 + 8] ;D - movq xmm4, [rsi + rdx * 4 + 8] ;E - movq xmm5, [rax + rdx * 4 + 8] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F + movd [dstq], m4 + movd [dstq + dstrideq], m7 - movq xmm6, [rsi + rbx + 8] ;G - movq xmm7, [rax + rbx + 8] ;H - punpcklbw xmm6, xmm7 ;G H + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 + dec heightq + jnz .loop - paddsw xmm0, xmm6 - movdqa xmm1, xmm2 - pmaxsw xmm2, xmm4 - pminsw xmm4, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx 
-%if %1 - movq xmm1, [rdi+8] - pavgb xmm0, xmm1 -%endif - - movq [rdi+8], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 + ; Do last row if output_height is odd + mov heightq, orig_height + and heightq, 1 + je .done + + movh m0, [srcq - 3] ; load src + movh m1, [srcq + 5] + punpcklqdq m0, m1 + + HORIZx4_ROW m0, m1 +%ifidn %1, h8_avg + movd m0, [dstq] + pavgb m1, m0 %endif - dec rcx - jnz .loop + movd [dstq], m1 +.done: + RET %endm -;void vpx_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE -sym(vpx_filter_block1d4_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx4 0 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE -sym(vpx_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx8 0 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, 
-; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE -sym(vpx_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx16 0 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_v8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx4 1 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_v8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx8 1 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d16_v8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 
16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - VERTx16 1 - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%macro HORIZx4_ROW 2 - movdqa %2, %1 - pshufb %1, [GLOBAL(shuf_t0t1)] - pshufb %2, [GLOBAL(shuf_t2t3)] - pmaddubsw %1, k0k1k4k5 - pmaddubsw %2, k2k3k6k7 - - movdqa xmm4, %1 - movdqa xmm5, %2 - psrldq %1, 8 - psrldq %2, 8 - movdqa xmm6, xmm5 - - paddsw xmm4, %2 - pmaxsw xmm5, %1 - pminsw %1, xmm6 - paddsw %1, xmm4 - paddsw %1, xmm5 - - paddsw %1, krd - psraw %1, 7 - packuswb %1, %1 +%macro HORIZx8_ROW 5 + mova %2, %1 + punpcklbw %1, %1 + punpckhbw %2, %2 + + mova %3, %2 + mova %4, %2 + mova %5, %2 + + palignr %2, %1, 1 + palignr %3, %1, 5 + palignr %4, %1, 9 + palignr %5, %1, 13 + + pmaddubsw %2, k0k1 + pmaddubsw %3, k2k3 + pmaddubsw %4, k4k5 + pmaddubsw %5, k6k7 + + paddsw %2, %5 + mova %1, %3 + pminsw %3, %4 + pmaxsw %1, %4 + paddsw %2, %3 + paddsw %1, %2 + paddsw %1, krd + psraw %1, 7 + packuswb %1, %1 %endm -%macro HORIZx4 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm6, xmm4, 0b ;k0_k1 - pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 - pshuflw xmm7, xmm4, 01010101b ;k2_k3 - pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 - pshufd xmm5, xmm5, 0 ;rounding - - movdqa k0k1k4k5, xmm6 - movdqa k2k3k6k7, xmm7 - movdqa krd, xmm5 +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define orig_height r7 +%else + %define orig_height heightmp +%endif + mov orig_height, heightq + shr heightq, 1 - 
movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - shr rcx, 1 .loop: - ;Do two rows once - movq xmm0, [rsi - 3] ;load src - movq xmm1, [rsi + 5] - movq xmm2, [rsi + rax - 3] - movq xmm3, [rsi + rax + 5] - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - - HORIZx4_ROW xmm0, xmm1 - HORIZx4_ROW xmm2, xmm3 -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 - movd xmm3, [rdi + rdx] - pavgb xmm2, xmm3 + movh m0, [srcq - 3] + movh m3, [srcq + 5] + movh m4, [srcq + sstrideq - 3] + movh m7, [srcq + sstrideq + 5] + punpcklqdq m0, m3 + mova m1, m0 + punpcklbw m0, m0 + punpckhbw m1, m1 + mova m5, m1 + palignr m5, m0, 13 + pmaddubsw m5, k6k7 + mova m2, m1 + mova m3, m1 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpcklqdq m4, m7 + mova m6, m4 + punpcklbw m4, m4 + palignr m2, m0, 5 + punpckhbw m6, m6 + palignr m3, m0, 9 + mova m7, m6 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m4, 13 + paddsw m1, m5 + mova m5, m6 + mova m0, m2 + palignr m5, m4, 5 + pminsw m2, m3 + pmaddubsw m7, k6k7 + pmaxsw m3, m0 + paddsw m1, m2 + mova m0, m6 + palignr m6, m4, 1 + pmaddubsw m5, k2k3 + paddsw m1, m3 + pmaddubsw m6, k0k1 + palignr m0, m4, 9 + paddsw m1, krd + pmaddubsw m0, k4k5 + mova m4, m5 + psraw m1, 7 + pminsw m5, m0 + paddsw m6, m7 + packuswb m1, m1 + + paddsw m6, m5 + pmaxsw m0, m4 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 + packuswb m6, m6 + +%ifidn %1, h8_avg + movh m0, [dstq] + movh m2, [dstq + dstrideq] + pavgb m1, m0 + pavgb m6, m2 %endif - movd [rdi], xmm0 - movd [rdi +rdx], xmm2 + movh [dstq], m1 + movh [dstq + dstrideq], m6 - lea rsi, [rsi + rax] - prefetcht0 [rsi + 4 * rax - 3] - lea rsi, [rsi + rax] - lea rdi, [rdi + 2 * rdx] - prefetcht0 [rsi + 2 * rax - 3] + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + dec heightq + jnz .loop - dec rcx - jnz .loop 
+ ;Do last row if output_height is odd + mov heightq, orig_height + and heightq, 1 + je .done - ; Do last row if output_height is odd - movsxd rcx, dword ptr arg(4) ;output_height - and rcx, 1 - je .done + movh m0, [srcq - 3] + movh m3, [srcq + 5] + punpcklqdq m0, m3 - movq xmm0, [rsi - 3] ; load src - movq xmm1, [rsi + 5] - punpcklqdq xmm0, xmm1 + HORIZx8_ROW m0, m1, m2, m3, m4 - HORIZx4_ROW xmm0, xmm1 -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 +%ifidn %1, h8_avg + movh m1, [dstq] + pavgb m0, m1 %endif - movd [rdi], xmm0 -.done + movh [dstq], m0 +.done: + RET %endm -%macro HORIZx8_ROW 4 - movdqa %2, %1 - movdqa %3, %1 - movdqa %4, %1 - - pshufb %1, [GLOBAL(shuf_t0t1)] - pshufb %2, [GLOBAL(shuf_t2t3)] - pshufb %3, [GLOBAL(shuf_t4t5)] - pshufb %4, [GLOBAL(shuf_t6t7)] - - pmaddubsw %1, k0k1 - pmaddubsw %2, k2k3 - pmaddubsw %3, k4k5 - pmaddubsw %4, k6k7 - - paddsw %1, %4 - movdqa %4, %2 - pmaxsw %2, %3 - pminsw %3, %4 - paddsw %1, %3 - paddsw %1, %2 - - paddsw %1, krd - psraw %1, 7 - packuswb %1, %1 +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movh m0, [srcq - 3] + movh m4, [srcq + 5] + movh m6, [srcq + 13] + punpcklqdq m0, m4 + mova m7, m0 + punpckhbw m0, m0 + mova m1, m0 + punpcklqdq m4, m6 + mova m3, m0 + punpcklbw m7, m7 + + palignr m3, m7, 13 + mova m2, m0 + pmaddubsw m3, k6k7 + palignr m0, m7, 1 + pmaddubsw m0, k0k1 + palignr m1, m7, 5 + pmaddubsw m1, k2k3 + palignr m2, m7, 9 + pmaddubsw m2, k4k5 + paddsw m0, m3 + mova m3, m4 + punpckhbw m4, m4 + mova m5, m4 + punpcklbw m3, m3 + mova m7, m4 + palignr m5, m3, 5 + mova m6, m4 + palignr m4, m3, 1 + pmaddubsw m4, k0k1 + pmaddubsw m5, k2k3 + palignr m6, m3, 9 + pmaddubsw m6, k4k5 + palignr m7, m3, 13 + pmaddubsw m7, k6k7 + + mova m3, m1 + pmaxsw 
m1, m2 + pminsw m2, m3 + paddsw m0, m2 + paddsw m0, m1 + paddsw m4, m7 + mova m7, m5 + pmaxsw m5, m6 + pminsw m6, m7 + paddsw m4, m6 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 + packuswb m0, m4 +%ifidn %1, h8_avg + mova m1, [dstq] + pavgb m0, m1 +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightq + jnz .loop + RET %endm -%macro HORIZx8 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - shr rcx, 1 +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER16 h8_avg +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER8 h8_avg +SUBPIX_HFILTER4 h8 +SUBPIX_HFILTER4 h8_avg + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif .loop: - movq xmm0, [rsi - 3] ;load src - movq xmm3, [rsi + 5] - 
movq xmm4, [rsi + rax - 3] - movq xmm7, [rsi + rax + 5] - punpcklqdq xmm0, xmm3 - punpcklqdq xmm4, xmm7 - - HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 - HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 -%if %1 - movq xmm1, [rdi] - movq xmm2, [rdi + rdx] - pavgb xmm0, xmm1 - pavgb xmm4, xmm2 + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + mova tmp, m2 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + pmaxsw m2, m4 + paddsw m0, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + pminsw m4, tmp + paddsw m0, m4 + mova m4, m3 + paddsw m0, m2 + pminsw m3, m5 + pmaxsw m5, m4 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m7 + packuswb m0, m0 + + paddsw m1, m3 + paddsw m1, m5 + paddsw m1, krd + psraw m1, 7 + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 %endif - movq [rdi], xmm0 - movq [rdi + rdx], xmm4 - - lea rsi, [rsi + rax] - prefetcht0 [rsi + 4 * rax - 3] - lea rsi, [rsi + rax] - lea rdi, [rdi + 2 * rdx] - prefetcht0 [rsi + 2 * rax - 3] - dec rcx - jnz .loop - - ;Do last row if output_height is odd - movsxd rcx, dword ptr arg(4) ;output_height - and rcx, 1 - je .done - - movq xmm0, [rsi - 3] - movq xmm3, [rsi + 5] - punpcklqdq xmm0, xmm3 - - HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 %endif 
- movq [rdi], xmm0 -.done + movx [dstq], m1 + add dstq, dst_stride + sub heightq, 2 + cmp heightq, 1 + jg .loop + + cmp heightq, 0 + je .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2 ] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m0, m6 + mova m1, m2 + pmaxsw m2, m4 + pminsw m4, m1 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 +.done: + RET %endm -%macro HORIZx16 1 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movq xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 13, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + + mova m4, [filterq] + SETUP_LOCAL_VARS +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride 
dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 .loop: - prefetcht0 [rsi + 2 * rax -3] - - movq xmm0, [rsi - 3] ;load src data - movq xmm4, [rsi + 5] - movq xmm6, [rsi + 13] - punpcklqdq xmm0, xmm4 - punpcklqdq xmm4, xmm6 - - movdqa xmm7, xmm0 - - punpcklbw xmm7, xmm7 - punpckhbw xmm0, xmm0 - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - - palignr xmm0, xmm7, 1 - palignr xmm1, xmm7, 5 - pmaddubsw xmm0, k0k1 - palignr xmm2, xmm7, 9 - pmaddubsw xmm1, k2k3 - palignr xmm3, xmm7, 13 - - pmaddubsw xmm2, k4k5 - pmaddubsw xmm3, k6k7 - paddsw xmm0, xmm3 - - movdqa xmm3, xmm4 - punpcklbw xmm3, xmm3 - punpckhbw xmm4, xmm4 - - movdqa xmm5, xmm4 - movdqa xmm6, xmm4 - movdqa xmm7, xmm4 - - palignr xmm4, xmm3, 1 - palignr xmm5, xmm3, 5 - palignr xmm6, xmm3, 9 - palignr xmm7, xmm3, 13 - - movdqa xmm3, xmm1 - pmaddubsw xmm4, k0k1 - pmaxsw xmm1, xmm2 - pmaddubsw xmm5, k2k3 - pminsw xmm2, xmm3 - pmaddubsw xmm6, k4k5 - paddsw xmm0, xmm2 - pmaddubsw xmm7, k6k7 - paddsw xmm0, xmm1 - - paddsw xmm4, xmm7 - movdqa xmm7, xmm5 - pmaxsw xmm5, xmm6 - pminsw xmm6, xmm7 - paddsw xmm4, xmm6 - paddsw xmm4, xmm5 - - paddsw xmm0, krd - paddsw xmm4, krd - psraw xmm0, 7 - psraw xmm4, 7 - packuswb xmm0, xmm0 - packuswb xmm4, xmm4 - punpcklqdq xmm0, xmm4 -%if %1 - movdqa xmm1, [rdi] - pavgb xmm0, xmm1 + movh m0, [srcq ] ;A + movh m1, [srcq + sstrideq ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + mova m1, m2 + movh m5, [src1q + 
sstrideq * 2 + 8] ;D + pmaxsw m2, m4 + punpcklbw m7, m5 ;C D + pminsw m4, m1 + paddsw m0, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + packuswb m0, m0 + pmaddubsw m2, k6k7 +%ifidn %1, v8_avg + movh m4, [dstq] + pavgb m0, m4 %endif - - lea rsi, [rsi + rax] - movdqa [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .loop + movh [dstq], m0 + mova m6, m7 + pmaxsw m7, m1 + pminsw m1, m6 + paddsw m3, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 + packuswb m3, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + movh m1, [dstq + 8] + pavgb m3, m1 +%endif + movh [dstq + 8], m3 + add dstq, dst_stride + dec heightq + jnz .loop + RET %endm -;void vpx_filter_block1d4_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE -sym(vpx_filter_block1d4_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 3 - %define k0k1k4k5 [rsp + 16 * 0] - %define k2k3k6k7 [rsp + 16 * 1] - %define krd [rsp + 16 * 2] - - HORIZx4 0 - - add rsp, 16 * 3 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d8_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE -sym(vpx_filter_block1d8_h8_ssse3): - push rbp - mov rbp, rsp - 
SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - HORIZx8 0 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vpx_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE -sym(vpx_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - HORIZx16 0 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d4_h8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 3 - %define k0k1k4k5 [rsp + 16 * 0] - %define k2k3k6k7 [rsp + 16 * 1] - %define krd [rsp + 16 * 2] - - HORIZx4 1 - - add rsp, 16 * 3 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d8_h8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 
16*3] - %define krd [rsp + 16*4] - - HORIZx8 1 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE -sym(vpx_filter_block1d16_h8_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - HORIZx16 1 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -SECTION_RODATA -align 16 -shuf_t0t1: - db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -align 16 -shuf_t2t3: - db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -align 16 -shuf_t4t5: - db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -align 16 -shuf_t6t7: - db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER16 v8_avg +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8_avg, 8 +SUBPIX_VFILTER v8, 4 +SUBPIX_VFILTER v8_avg, 4 -- 2.40.0