From 2d653411c2135377fb8c956e897880ff997b50ec Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Thu, 8 Jun 2017 21:14:08 +0200
Subject: [PATCH] x86: AVX-512 sub4x4_dct

---
 common/dct.c         |  5 +++++
 common/x86/dct-a.asm | 44 ++++++++++++++++++++++++++++++++++++++++++++
 common/x86/dct.h     |  1 +
 3 files changed, 50 insertions(+)

diff --git a/common/dct.c b/common/dct.c
index 0452dcd0..a24e7a31 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -711,6 +711,11 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
 #endif
     }
+
+    if( cpu&X264_CPU_AVX512 )
+    {
+        dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
+    }
 #endif //HAVE_MMX
 
 #if HAVE_ALTIVEC
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 412bade6..a5011e3f 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -610,6 +610,50 @@ cglobal sub16x16_dct, 3,3,6
     DCT4_1D 0, 1, 2, 3, 4
     STORE16_DCT_AVX2 0, 1, 2, 3, 4
     ret
+
+%macro DCT4x4_AVX512 0 ; in: m0/m2 = pix1/pix2 rows 0 1, m1/m3 = pix1/pix2 rows 3 2 (words), k1, k2; out: m2, m0
+    psubw      m0, m2            ; 0 1
+    psubw      m1, m3            ; 3 2
+    SUMSUB_BA   w, 1, 0, 2
+    SBUTTERFLY wd, 1, 0, 2
+    paddw      m2, m1, m0
+    psubw      m3, m1, m0
+    paddw      m2 {k1}, m1       ; 0+1+2+3 0<<1+1-2-3<<1
+    psubw      m3 {k1}, m0       ; 0-1-2+3 0-1<<1+2<<1-3
+    shufps     m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
+    punpcklqdq m2, m3            ; a0 b0 a1 b1 c0 d0 c1 d1
+    SUMSUB_BA   w, 1, 2, 3
+    shufps     m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
+    shufps     m1, m2, q2020     ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
+    paddw      m2, m1, m3
+    psubw      m0, m1, m3
+    paddw      m2 {k2}, m1       ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
+    psubw      m0 {k2}, m3       ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
+%endmacro
+
+INIT_XMM avx512
+cglobal sub4x4_dct ; void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
+    mov         eax, 0xf0aa      ; low nibble 1010b: dwords 1,3; low byte 0xaa: odd words
+    kmovw        k1, eax
+    PROLOGUE 3,3
+    movd         m0, [r1+0*FENC_STRIDE]      ; pix1 row 0 -> dword 0, upper dwords zeroed
+    movd         m2, [r2+0*FDEC_STRIDE]      ; pix2 row 0 -> dword 0, upper dwords zeroed
+    vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE] ; merge pix1 row 1 into dwords 1 and 3
+    vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE] ; merge pix2 row 1 into dwords 1 and 3
+    movd         m1, [r1+3*FENC_STRIDE]      ; pix1 row 3 -> dword 0, upper dwords zeroed
+    movd         m3, [r2+3*FDEC_STRIDE]      ; pix2 row 3 -> dword 0, upper dwords zeroed
+    vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE] ; merge pix1 row 2 into dwords 1 and 3
+    vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE] ; merge pix2 row 2 into dwords 1 and 3
+    kshiftrw     k2, k1, 8       ; k2 = 0x00f0: words 4-7
+    pxor         m4, m4          ; zero register for the byte->word unpacks
+    punpcklbw    m0, m4          ; pix1 rows 0 1 as words
+    punpcklbw    m2, m4          ; pix2 rows 0 1 as words
+    punpcklbw    m1, m4          ; pix1 rows 3 2 as words
+    punpcklbw    m3, m4          ; pix2 rows 3 2 as words
+    DCT4x4_AVX512
+    mova       [r0], m2          ; dct[0..7]
+    mova    [r0+16], m0          ; dct[8..15]
+    RET
 %endif ; HIGH_BIT_DEPTH
 
 INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 206fbeea..41920f76 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -34,6 +34,7 @@ void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2
 void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-- 
2.40.0