From 4b5e462d08fe17fa2a1133f30ea7167cb02b837a Mon Sep 17 00:00:00 2001
From: Linfeng Zhang
Date: Thu, 19 May 2016 16:52:14 -0700
Subject: [PATCH] Upgrade vpx_lpf_{vertical,horizontal}_4 mmx to sse2

Followed the code style of other lpf functions. These 2 functions put 2
rows of data in a single xmm register, so they have similar but not
identical filter operations, and cannot share the same macros.

Change-Id: I3bab55a5d1a1232926ac8fd1f03251acc38302bc
---
 test/lpf_8_test.cc             |  14 +-
 vpx_dsp/vpx_dsp.mk             |   1 -
 vpx_dsp/vpx_dsp_rtcd_defs.pl   |   4 +-
 vpx_dsp/x86/loopfilter_mmx.asm | 436 ---------------------------------
 vpx_dsp/x86/loopfilter_sse2.c  | 207 ++++++++++++++++
 5 files changed, 213 insertions(+), 449 deletions(-)
 delete mode 100644 vpx_dsp/x86/loopfilter_mmx.asm

diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 778a36ca3..94646e4ff 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -430,16 +430,6 @@ TEST_P(Loop8Test9Param, ValueCheck) {
 
 using std::tr1::make_tuple;
 
-#if HAVE_MMX && CONFIG_USE_X86INC && !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    MMX, Loop8Test6Param,
-    ::testing::Values(
-        make_tuple(&vpx_lpf_horizontal_4_mmx,
-                   &vpx_lpf_horizontal_4_c, 8),
-        make_tuple(&vpx_lpf_vertical_4_mmx,
-                   &vpx_lpf_vertical_4_c, 8)));
-#endif  // HAVE_MMX
-
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
@@ -497,12 +487,16 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE2, Loop8Test6Param,
     ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_sse2,
+                   &vpx_lpf_horizontal_4_c, 8),
         make_tuple(&vpx_lpf_horizontal_8_sse2,
                    &vpx_lpf_horizontal_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_8_sse2,
                    &vpx_lpf_horizontal_edge_8_c, 8),
         make_tuple(&vpx_lpf_horizontal_edge_16_sse2,
                    &vpx_lpf_horizontal_edge_16_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_sse2,
+                   &vpx_lpf_vertical_4_c, 8),
         make_tuple(&vpx_lpf_vertical_8_sse2,
                    &vpx_lpf_vertical_8_c, 8),
         make_tuple(&vpx_lpf_vertical_16_sse2,
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 43802d76d..6fa3bc056 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -39,7 +39,6 @@ endif
 DSP_SRCS-yes += intrapred.c
 
 ifeq ($(CONFIG_USE_X86INC),yes)
-DSP_SRCS-$(HAVE_MMX) += x86/loopfilter_mmx.asm
 DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index aeadbafc1..128221e98 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -543,7 +543,7 @@ specialize qw/vpx_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_vertical_8_dual_neon_asm=vpx_lpf_vertical_8_dual_neon;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 neon dspr2 msa/, "$mmx_x86inc";
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
@@ -564,7 +564,7 @@ specialize qw/vpx_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/;
 $vpx_lpf_horizontal_8_dual_neon_asm=vpx_lpf_horizontal_8_dual_neon;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 neon dspr2 msa/, "$mmx_x86inc";
"$mmx_x86inc"; +specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/; add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; diff --git a/vpx_dsp/x86/loopfilter_mmx.asm b/vpx_dsp/x86/loopfilter_mmx.asm deleted file mode 100644 index 45d0ecc0d..000000000 --- a/vpx_dsp/x86/loopfilter_mmx.asm +++ /dev/null @@ -1,436 +0,0 @@ -; -; Copyright (c) 2016 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -t80: - times 8 db 0x80 -t3: - times 8 db 0x03 -t4: - times 8 db 0x04 -ones: - times 4 dw 0x0001 - -SECTION .text - -%define stkreg rsp - -%define t0 0 -%define t1 t0 + 16 -%define p1 t1 + 16 -%define p0 p1 + 16 -%define q0 p0 + 16 -%define q1 q0 + 16 -%define lstacksize q1 + 16 - -%define goffsetq _limitq - -;void vpx_lpf_horizontal_4_mmx(unsigned char *src_ptr, int src_pixel_step, -; const char *blimit, const char *limit, -; const char *thresh); -INIT_MMX mmx -cglobal lpf_horizontal_4, 5, 6, 8, 0 - lstacksize, \ - s, p, _blimit, _limit, _thresh, s1 - movq m7, [_limitq] - GET_GOT goffsetq -%if GET_GOT_DEFINED=1 - add rsp, gprsize ; restore stack -%endif - lea s1q, [sq + pq] ; s1q points to row +1 - - ; calculate breakout conditions - movq m2, [s1q + 2 * pq] ; q3 - movq m1, [ sq + 2 * pq] ; q2 - movq m6, m1 ; q2 - psubusb m1, m2 ; q2-=q3 - psubusb m2, m6 ; q3-=q2 - por m1, m2 ; abs(q3-q2) - psubusb m1, m7 - movq m4, [sq + pq] ; q1 - movq m3, m4 ; q1 - psubusb m4, m6 ; q1-=q2 - psubusb m6, m3 ; q2-=q1 - por m4, m6 ; abs(q2-q1) - psubusb m4, m7 - por m1, m4 - movq m4, [sq] ; q0 - movq m0, m4 ; q0 - psubusb m4, m3 ; q0-=q1 - psubusb m3, m0 ; q1-=q0 - por m4, m3 ; abs(q0-q1) - movq [stkreg + t0], m4 ; save to t0 - psubusb m4, m7 - por m1, m4 - neg pq ; negate pitch to deal with - ; above border - movq m2, [ sq + 4 * pq] ; p3 - movq m4, [s1q + 4 * pq] ; p2 - movq m5, m4 ; p2 - psubusb m4, m2 ; p2-=p3 - psubusb m2, m5 ; p3-=p2 - por m4, m2 ; abs(p3 - p2) - psubusb m4, m7 - por m1, m4 - movq m4, [sq + 2 * pq] ; p1 - movq m3, m4 ; p1 - psubusb m4, m5 ; p1-=p2 - psubusb m5, m3 ; p2-=p1 - por m4, m5 ; abs(p2 - p1) - psubusb m4, m7 - por m1, m4 - movq m2, m3 ; p1 - movq m4, [sq + pq] ; p0 - movq m5, m4 ; p0 - psubusb m4, m3 ; p0-=p1 - psubusb m3, m5 ; p1-=p0 - por m4, m3 ; abs(p1 - p0) - movq [stkreg + t1], m4 ; save to t1 - psubusb m4, m7 - por m1, m4 - movq m3, [s1q] ; q1 - movq m4, m3 ; q1 - psubusb m3, m2 ; q1-=p1 - psubusb m2, m4 ; p1-=q1 - por m2, m3 ; abs(p1-q1) - pand m2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw m2, 1 ; abs(p1-q1)/2 - movq m6, m5 ; p0 - movq m3, [sq] ; q0 - psubusb m5, m3 ; p0-=q0 - psubusb m3, m6 ; q0-=p0 - por m5, m3 ; abs(p0 - q0) - paddusb m5, m5 ; abs(p0-q0)*2 - paddusb m5, m2 ; abs (p0 - q0) * 2 + abs(p1-q1)/2 - movq m7, [_blimitq] ; blimit - psubusb m5, m7 ; abs (p0 - q0) * 2 + - ; abs(p1-q1)/2 > blimit - por m1, m5 - pxor m5, m5 - pcmpeqb m1, m5 ; mask m1 - - ; calculate high edge variance - movq m7, [_threshq] - movq m4, [stkreg + 
-    psubusb    m4, m7
-    movq       m3, [stkreg + t1]       ; get abs (p1 - p0)
-    psubusb    m3, m7
-    paddb      m4, m3                  ; abs(q1 - q0) > thresh ||
-                                       ; abs(p1 - p0) > thresh
-    pcmpeqb    m4, m5
-    pcmpeqb    m5, m5
-    movq       m3, [GLOBAL(t80)]
-    pxor       m4, m5
-
-    ; start work on filters
-    movq       m2, [sq + 2 * pq]       ; p1
-    movq       m7, [s1q]               ; q1
-    pxor       m2, m3                  ; p1 converted to signed values
-    pxor       m7, m3                  ; q1 converted to signed values
-    psubsb     m2, m7                  ; p1 - q1
-    pand       m2, m4                  ; high var mask (hvm)(p1 - q1)
-    pxor       m6, m3                  ; p0 converted to signed values
-    pxor       m0, m3                  ; q0 converted to signed values
-    movq       m3, m0                  ; q0
-    psubsb     m0, m6                  ; q0 - p0
-    paddsb     m2, m0                  ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb     m2, m0                  ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb     m2, m0                  ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand       m1, m2                  ; mask filter values we don't
-                                       ; care about
-    movq       m2, m1
-    paddsb     m1, [GLOBAL(t4)]        ; 3* (q0 - p0) + hvm(p1 - q1) + 4
-    paddsb     m2, [GLOBAL(t3)]        ; 3* (q0 - p0) + hvm(p1 - q1) + 3
-
-    pxor       m0, m0
-    pxor       m5, m5
-    punpcklbw  m0, m2
-    punpckhbw  m5, m2
-    psraw      m0, 11
-    psraw      m5, 11
-    packsswb   m0, m5
-    movq       m2, m0                  ; (3* (q0 - p0) + hvm(p1 - q1)
-                                       ; + 3) >> 3;
-    pxor       m0, m0
-    movq       m5, m1                  ; abcdefgh
-    punpcklbw  m0, m1                  ; e0f0g0h0
-    psraw      m0, 11                  ; sign extended shift right by 3
-    pxor       m1, m1
-    punpckhbw  m1, m5                  ; a0b0c0d0
-    psraw      m1, 11                  ; sign extended shift right by 3
-    movq       m5, m0                  ; save results
-
-    packsswb   m0, m1                  ; (3* (q0 - p0) + hvm(p1 - q1)
-                                       ; + 4) >>3
-    paddsw     m5, [GLOBAL(ones)]
-    paddsw     m1, [GLOBAL(ones)]
-    psraw      m5, 1
-    psraw      m1, 1
-    packsswb   m5, m1                  ; (3* (q0 - p0) + hvm(p1 - q1)
-                                       ; + 4) >>4
-    movq       m1, [GLOBAL(t80)]
-    pandn      m4, m5                  ; high edge variance additive
-    paddsb     m6, m2                  ; p0+= p0 add
-    pxor       m6, m1                  ; unoffset
-    movq       [sq + pq], m6           ; write back
-    movq       m6, [sq + 2 * pq]       ; p1
-    pxor       m6, m1                  ; reoffset
-    paddsb     m6, m4                  ; p1+= p1 add
-    pxor       m6, m1                  ; unoffset
-    movq       [sq + 2 * pq], m6       ; write back
-    psubsb     m3, m0                  ; q0-= q0 add
-    pxor       m3, m1                  ; unoffset
-    movq       [sq], m3                ; write back
-    psubsb     m7, m4                  ; q1-= q1 add
-    pxor       m7, m1                  ; unoffset
-    movq       [s1q], m7               ; write back
-    RET
-
-;void vpx_lpf_vertical_4_mmx(unsigned char *src_ptr, int src_pixel_step,
-;                            const char *blimit, const char *limit,
-;                            const char *thresh);
-INIT_MMX mmx
-cglobal lpf_vertical_4, 5, 6, 8, 0 - lstacksize, \
-                        s, p, _blimit, _limit, _thresh, s1
-    lea        sq, [sq + pq * 4 - 4]
-    lea        s1q, [sq + pq]          ; s1q points to row +1
-    ;transpose
-    movq       m6, [ sq + 2 * pq]      ; 67 66 65 64 63 62 61 60
-    movq       m7, m6                  ; 77 76 75 74 73 72 71 70
-    punpckhbw  m7, [s1q + 2 * pq]      ; 77 67 76 66 75 65 74 64
-    punpcklbw  m6, [s1q + 2 * pq]      ; 73 63 72 62 71 61 70 60
-    movq       m4, [sq]                ; 47 46 45 44 43 42 41 40
-    movq       m5, m4                  ; 47 46 45 44 43 42 41 40
-    punpckhbw  m5, [sq + pq]           ; 57 47 56 46 55 45 54 44
-    punpcklbw  m4, [sq + pq]           ; 53 43 52 42 51 41 50 40
-    movq       m3, m5                  ; 57 47 56 46 55 45 54 44
-    punpckhwd  m5, m7                  ; 77 67 57 47 76 66 56 46
-    punpcklwd  m3, m7                  ; 75 65 55 45 74 64 54 44
-    movq       m2, m4                  ; 53 43 52 42 51 41 50 40
-    punpckhwd  m4, m6                  ; 73 63 53 43 72 62 52 42
-    punpcklwd  m2, m6                  ; 71 61 51 41 70 60 50 40
-    neg        pq
-    movq       m6, [ sq + pq * 2]      ; 27 26 25 24 23 22 21 20
-    movq       m1, m6                  ; 27 26 25 24 23 22 21 20
-    punpckhbw  m6, [ sq + pq ]         ; 37 27 36 36 35 25 34 24
-    punpcklbw  m1, [ sq + pq ]         ; 33 23 32 22 31 21 30 20
-    movq       m7, [ sq + pq * 4];     ; 07 06 05 04 03 02 01 00
-    punpckhbw  m7, [s1q + pq * 4]      ; 17 07 16 06 15 05 14 04
-    movq       m0, m7                  ; 17 07 16 06 15 05 14 04
-    punpckhwd  m7, m6                  ; 37 27 17 07 36 26 16 06
-    punpcklwd  m0, m6                  ; 35 25 15 05 34 24 14 04
-    movq       m6, m7                  ; 37 27 17 07 36 26 16 06
-    punpckhdq  m7, m5                  ; 77 67 57 47 37 27 17 07 = q3
-    punpckldq  m6, m5                  ; 76 66 56 46 36 26 16 06 = q2
-    movq       m5, m6                  ; 76 66 56 46 36 26 16 06
-    psubusb    m5, m7                  ; q2-q3
-    psubusb    m7, m6                  ; q3-q2
-    por        m7, m5;                 ; m7=abs (q3-q2)
-    movq       m5, m0                  ; 35 25 15 05 34 24 14 04
-    punpckhdq  m5, m3                  ; 75 65 55 45 35 25 15 05 = q1
-    punpckldq  m0, m3                  ; 74 64 54 44 34 24 15 04 = q0
-    movq       m3, m5                  ; 75 65 55 45 35 25 15 05 = q1
-    psubusb    m3, m6                  ; q1-q2
-    psubusb    m6, m5                  ; q2-q1
-    por        m6, m3                  ; m6=abs(q2-q1)
-
-    movq       [stkreg + q1], m5       ; save q1
-    movq       [stkreg + q0], m0       ; save q0
-
-    movq       m3, [ sq + pq * 4]      ; 07 06 05 04 03 02 01 00
-    punpcklbw  m3, [s1q + pq * 4]      ; 13 03 12 02 11 01 10 00
-    movq       m0, m3                  ; 13 03 12 02 11 01 10 00
-    punpcklwd  m0, m1                  ; 31 21 11 01 30 20 10 00
-    punpckhwd  m3, m1                  ; 33 23 13 03 32 22 12 02
-    movq       m1, m0                  ; 31 21 11 01 30 20 10 00
-    punpckldq  m0, m2                  ; 70 60 50 40 30 20 10 00 =p3
-    punpckhdq  m1, m2                  ; 71 61 51 41 31 21 11 01 =p2
-    movq       m2, m1                  ; 71 61 51 41 31 21 11 01 =p2
-    psubusb    m2, m0                  ; p2-p3
-    psubusb    m0, m1                  ; p3-p2
-    por        m0, m2                  ; m0=abs(p3-p2)
-    movq       m2, m3                  ; 33 23 13 03 32 22 12 02
-    punpckldq  m2, m4                  ; 72 62 52 42 32 22 12 02 = p1
-    punpckhdq  m3, m4                  ; 73 63 53 43 33 23 13 03 = p0
-
-    movq       [stkreg + p0], m3       ; save p0
-    movq       [stkreg + p1], m2       ; save p1
-    movq       m5, m2                  ; m5 = p1
-    psubusb    m2, m1                  ; p1-p2
-    psubusb    m1, m5                  ; p2-p1
-    por        m1, m2                  ; m1=abs(p2-p1)
-    movq       m4, [_limitq]
-    GET_GOT    goffsetq
-%if GET_GOT_DEFINED=1
-    add        rsp, gprsize            ; restore stack
-%endif
-    psubusb    m7, m4
-    psubusb    m0, m4
-    psubusb    m1, m4
-    psubusb    m6, m4
-    por        m7, m6
-    por        m0, m1
-    por        m0, m7                  ; abs(q3-q2) > limit ||
-                                       ; abs(p3-p2) > limit ||
-                                       ; abs(p2-p1) > limit ||
-                                       ; abs(q2-q1) > limit
-    movq       m1, m5                  ; p1
-    movq       m7, m3                  ; m3=m7=p0
-    psubusb    m7, m5                  ; p0 - p1
-    psubusb    m5, m3                  ; p1 - p0
-    por        m5, m7                  ; abs(p1-p0)
-    movq       [stkreg + t0], m5       ; save abs(p1-p0)
-    psubusb    m5, m4
-    por        m0, m5                  ; m0=mask
-    movq       m5, [stkreg + q0]       ; m5=q0
-    movq       m7, [stkreg + q1]       ; m7=q1
-    movq       m6, m5                  ; m6=q0
-    movq       m2, m7                  ; q1
-    psubusb    m5, m7                  ; q0-q1
-    psubusb    m7, m6                  ; q1-q0
-    por        m7, m5                  ; abs(q1-q0)
-    movq       [stkreg + t1], m7       ; save abs(q1-q0)
-    psubusb    m7, m4
-    por        m0, m7                  ; mask
-    movq       m5, m2                  ; q1
-    psubusb    m5, m1                  ; q1-=p1
-    psubusb    m1, m2                  ; p1-=q1
-    por        m5, m1                  ; abs(p1-q1)
-    pand       m5, [GLOBAL(tfe)]       ; set lsb of each byte to zero
-    psrlw      m5, 1                   ; abs(p1-q1)/2
-    movq       m4, [_blimitq]
-    movq       m1, m3                  ; m1=m3=p0
-    movq       m7, m6                  ; m7=m6=q0
-    psubusb    m1, m7                  ; p0-q0
-    psubusb    m7, m3                  ; q0-p0
-    por        m1, m7                  ; abs(q0-p0)
-    paddusb    m1, m1                  ; abs(q0-p0)*2
-    paddusb    m1, m5                  ; abs(p0 - q0)*2 + abs(p1-q1)/2
-    psubusb    m1, m4                  ; abs(p0 - q0)*2 + abs(p1-q1)/2
-                                       ; > blimit
-    por        m1, m0;                 ; mask
-    pxor       m0, m0
-    pcmpeqb    m1, m0
-
-    ; calculate high edge variance
-    movq       m7, [_threshq]
-    movq       m4, [stkreg + t0]       ; get abs (q1 - q0)
-    psubusb    m4, m7
-    movq       m3, [stkreg + t1]       ; get abs (p1 - p0)
-    psubusb    m3, m7
-    por        m4, m3                  ; abs(q1 - q0) > thresh ||
-                                       ; abs(p1 - p0) > thresh
-    pcmpeqb    m4, m0
-    pcmpeqb    m0, m0
-    movq       m3, [GLOBAL(t80)]
-    pxor       m4, m0
-
-    ; start work on filters
-    movq       m2, [stkreg + p1]
-    movq       m7, [stkreg + q1]
-    movq       m6, [stkreg + p0]
-    movq       m0, [stkreg + q0]
-    pxor       m2, m3
-    pxor       m7, m3
-    psubsb     m2, m7                  ; p1 - q1
-    pand       m2, m4                  ; high var mask (hvm)(p1 - q1)
-    pxor       m6, m3
-    pxor       m0, m3
-    movq       m3, m0                  ; q0
-    psubsb     m0, m6                  ; q0 - p0
-    paddsb     m2, m0                  ; 1 * (q0 - p0) + hvm(p1 - q1)
-    paddsb     m2, m0                  ; 2 * (q0 - p0) + hvm(p1 - q1)
-    paddsb     m2, m0                  ; 3 * (q0 - p0) + hvm(p1 - q1)
-    pand       m1, m2                  ; mask filter values we don't
-                                       ; care about
-    movq       m2, m1
-    paddsb     m1, [GLOBAL(t4)]        ; 3*(q0 - p0) + hvm(p1 - q1) + 4
-    paddsb     m2, [GLOBAL(t3)]        ; 3*(q0 - p0) + hvm(p1 - q1) + 3
-    pxor       m0, m0
-    pxor       m5, m5
-    punpcklbw  m0, m2
-    punpckhbw  m5, m2
-    psraw      m0, 11
-    psraw      m5, 11
-    packsswb   m0, m5
-    movq       m2, m0                  ; (3*(q0 - p0) + hvm(p1 - q1)
-                                       ; + 3) >> 3;
-    pxor       m0, m0
-    movq       m5, m1                  ; abcdefgh
-    punpcklbw  m0, m1                  ; e0f0g0h0
-    psraw      m0, 11                  ; sign extended shift right by 3
-    pxor       m1, m1
-    punpckhbw  m1, m5                  ; a0b0c0d0
-    psraw      m1, 11                  ; sign extended shift right by 3
-    movq       m5, m0                  ; save results
-    packsswb   m0, m1                  ; (3*(q0 - p0) + hvm(p1 - q1)
-                                       ; + 4) >>3
-    paddsw     m5, [GLOBAL(ones)]
-    paddsw     m1, [GLOBAL(ones)]
-    psraw      m5, 1
-    psraw      m1, 1
-    packsswb   m5, m1                  ; (3* (q0 - p0) + hvm(p1 - q1)
-                                       ; + 4) >>4
-    pandn      m4, m5                  ; high edge variance additive
-    movq       m5, [GLOBAL(t80)]
-    paddsb     m6, m2                  ; p0+= p0 add
-    pxor       m6, m5                  ; unoffset
-                                       ; m6=p0
-    movq       m1, [stkreg + p1]
-    pxor       m1, m5                  ; reoffset
-    paddsb     m1, m4                  ; p1+= p1 add
-    pxor       m1, m5                  ; unoffset
-                                       ; m6 = p0 m1 = p1
-    psubsb     m3, m0                  ; q0-= q0 add
-    pxor       m3, m5                  ; unoffset
-                                       ; m3 = q0
-    psubsb     m7, m4                  ; q1-= q1 add
-    pxor       m7, m5                  ; unoffset
-                                       ; m7 = q1
-    ; transpose and write back
-    ; m1 = 72 62 52 42 32 22 12 02
-    ; m6 = 73 63 53 43 33 23 13 03
-    ; m3 = 74 64 54 44 34 24 14 04
-    ; m7 = 75 65 55 45 35 25 15 05
-    movq       m2, m1                  ; 72 62 52 42 32 22 12 02
-    punpcklbw  m2, m6                  ; 33 32 23 22 13 12 03 02
-    movq       m4, m3                  ; 74 64 54 44 34 24 14 04
-    punpckhbw  m1, m6                  ; 73 72 63 62 53 52 43 42
-    punpcklbw  m4, m7                  ; 35 34 25 24 15 14 05 04
-    punpckhbw  m3, m7                  ; 75 74 65 64 55 54 45 44
-    movq       m6, m2                  ; 33 32 23 22 13 12 03 02
-    punpcklwd  m2, m4                  ; 15 14 13 12 05 04 03 02
-    punpckhwd  m6, m4                  ; 35 34 33 32 25 24 23 22
-    movq       m5, m1                  ; 73 72 63 62 53 52 43 42
-    punpcklwd  m1, m3                  ; 55 54 53 52 45 44 43 42
-    punpckhwd  m5, m3                  ; 75 74 73 72 65 64 63 62
-
-    ; m2 = 15 14 13 12 05 04 03 02
-    ; m6 = 35 34 33 32 25 24 23 22
-    ; m5 = 55 54 53 52 45 44 43 42
-    ; m1 = 75 74 73 72 65 64 63 62
-    movd       [sq + pq * 4 + 2], m2
-    psrlq      m2, 32
-    movd       [s1q + pq * 4 + 2], m2
-    movd       [sq + pq * 2 + 2], m6
-    psrlq      m6, 32
-    movd       [sq + pq + 2], m6
-    movd       [sq + 2], m1
-    psrlq      m1, 32
-    movd       [s1q + 2], m1
-    neg        pq
-    movd       [s1q + pq + 2], m5
-    psrlq      m5, 32
-    movd       [s1q + pq * 2 + 2], m5
-    RET
diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c
index e03508a03..39a6ae3a8 100644
--- a/vpx_dsp/x86/loopfilter_sse2.c
+++ b/vpx_dsp/x86/loopfilter_sse2.c
@@ -18,6 +18,213 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }
 
+// filter_mask and hev_mask
+#define FILTER_HEV_MASK do {                                                  \
+  /* (abs(q1 - q0), abs(p1 - p0) */                                          \
+  __m128i flat = abs_diff(q1p1, q0p0);                                       \
+  /* abs(p1 - q1), abs(p0 - q0) */                                           \
+  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                         \
+  __m128i abs_p0q0, abs_p1q1, work;                                          \
+                                                                             \
+  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */        \
+  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero);\
+  hev = _mm_cmpgt_epi16(hev, thresh);                                        \
+  hev = _mm_packs_epi16(hev, hev);                                           \
+                                                                             \
+  /* const int8_t mask = filter_mask(*limit, *blimit, */                     \
+  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */     \
+  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */\
+  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */\
+  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                    \
+  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */     \
+  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                  \
+  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                  \
+  /* abs(p3 - p2), abs(p2 - p1) */                                           \
+  work = abs_diff(p3p2, p2p1);                                               \
+  flat = _mm_max_epu8(work, flat);                                           \
+  /* abs(q3 - q2), abs(q2 - q1) */                                           \
+  work = abs_diff(q3q2, q2q1);                                               \
+  flat = _mm_max_epu8(work, flat);                                           \
+  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                        \
+  mask = _mm_unpacklo_epi64(mask, flat);                                     \
+  mask = _mm_subs_epu8(mask, limit);                                         \
+  mask = _mm_cmpeq_epi8(mask, zero);                                         \
+  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                       \
+} while (0)
+
+#define FILTER4 do {                                                         \
+  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3,                  \
+                                    4, 4, 4, 4, 4, 4, 4, 4);                 \
+  const __m128i t80 = _mm_set1_epi8(0x80);                                   \
+  __m128i filter, filter2filter1, work;                                      \
+                                                                             \
+  ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                            \
+  qs1qs0 = _mm_xor_si128(q1q0, t80);                                         \
+                                                                             \
+  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                  \
+  work = _mm_subs_epi8(ps1ps0, qs1qs0);                                      \
+  filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                      \
+  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */         \
+  filter = _mm_subs_epi8(filter, work);                                      \
+  filter = _mm_subs_epi8(filter, work);                                      \
+  filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */              \
+  filter = _mm_and_si128(filter, mask); /* & mask */                         \
+  filter = _mm_unpacklo_epi64(filter, filter);                               \
+                                                                             \
+  /* filter1 = signed_char_clamp(filter + 4) >> 3; */                        \
+  /* filter2 = signed_char_clamp(filter + 3) >> 3; */                        \
+  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */      \
+  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);                \
+  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);        \
+  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */            \
+  filter = _mm_srai_epi16(filter, 11); /* >> 3 */                            \
+  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                  \
+                                                                             \
+  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                      \
+  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                      \
+  filter = _mm_unpacklo_epi8(filter, filter);                                \
+  filter = _mm_srai_epi16(filter, 9); /* round */                            \
+  filter = _mm_packs_epi16(filter, filter);                                  \
+  filter = _mm_andnot_si128(hev, filter);                                    \
+                                                                             \
+  hev = _mm_unpackhi_epi64(filter2filter1, filter);                          \
+  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);               \
+                                                                             \
+  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */    \
+  qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                            \
+  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */    \
+  ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                       \
+  qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                          \
+  ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                          \
+} while (0)
+
+void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+                               const uint8_t *_blimit, const uint8_t *_limit,
+                               const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  p3p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
+  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
+  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
+  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
+}
+
+void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+                             const uint8_t *_blimit, const uint8_t *_limit,
+                             const uint8_t *_thresh) {
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
+                         _mm_loadl_epi64((const __m128i *)_limit));
+  const __m128i thresh =
+      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+  __m128i x0, x1, x2, x3;
+  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+  __m128i mask, hev;
+
+  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
+                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
+
+  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
+
+  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
+
+  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
+                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
+
+  // Transpose 8x8
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
+  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  x0 = _mm_unpacklo_epi16(x2, x3);
+  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
+
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
+  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+  x2 = _mm_unpackhi_epi16(x2, x3);
+  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
+
+  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
+  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
+  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
+  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
+
+  FILTER_HEV_MASK;
+  FILTER4;
+
+  // Transpose 8x4 to 4x8
+  // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+  // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
+  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
+  // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
+  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
+  // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
+  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
+  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
+
+  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
+  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
+
+  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
+  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
+}
+
 void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
-- 
2.40.0
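
A note on the byte-wise shifts used by FILTER4 above: SSE2 has no per-byte
arithmetic shift, so the macro emulates "signed char >> 3" by duplicating
each byte into both halves of a 16-bit lane and shifting the word.
_mm_unpacklo_epi8(x, x) turns byte b into the word (b << 8) | (b & 0xff),
and _mm_srai_epi16(..., 11) then yields the sign-extended b >> 3, the same
trick the deleted asm performed with punpcklbw/punpckhbw plus psraw 11.
FILTER_HEV_MASK computes abs(p1 - q1) / 2 the same way with an unsigned
shift: the duplicated byte v becomes v * 257, and (v * 257) >> 9 equals
v >> 1 for all v in [0, 255]. The sketch below is for illustration only and
is not part of the patch; the helper name signed_byte_shift_right_3 is made
up, and it assumes a C99 compiler with SSE2 enabled (e.g. gcc -msse2).

/* Standalone sketch: byte-wise arithmetic >> 3 via duplicate-and-shift. */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static __m128i signed_byte_shift_right_3(__m128i x) {
  /* byte b -> word (b << 8) | (b & 0xff); arithmetic >> 11 gives the
   * sign-extended b >> 3 in each word lane. */
  __m128i lo = _mm_unpacklo_epi8(x, x);
  __m128i hi = _mm_unpackhi_epi8(x, x);
  lo = _mm_srai_epi16(lo, 11);
  hi = _mm_srai_epi16(hi, 11);
  /* Results fit in [-16, 15], so saturating repack never clamps. */
  return _mm_packs_epi16(lo, hi);
}

int main(void) {
  int8_t in[16], out[16];
  int i;
  for (i = 0; i < 16; ++i) in[i] = (int8_t)(i * 17 - 128);
  _mm_storeu_si128((__m128i *)out,
                   signed_byte_shift_right_3(
                       _mm_loadu_si128((const __m128i *)in)));
  for (i = 0; i < 16; ++i)
    printf("%4d >> 3 = %4d\n", in[i], out[i]);  /* matches in[i] >> 3 */
  return 0;
}

The same duplicate-and-shift idea is why FILTER4 can compute filter1 and
filter2 for both packed rows at once: the +4 and +3 biases sit in separate
64-bit halves (t3t4), one widening pass shifts both, and a single
_mm_packs_epi16 restores the byte layout.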