]> granicus.if.org Git - libx264/commitdiff
x86: AVX-512 coeff_last
authorHenrik Gramner <henrik@gramner.com>
Mon, 27 Mar 2017 16:19:53 +0000 (18:19 +0200)
committerHenrik Gramner <henrik@gramner.com>
Sun, 21 May 2017 20:42:17 +0000 (22:42 +0200)
common/quant.c
common/x86/quant-a.asm
common/x86/quant.h
tools/checkasm.c

index d14144451144e05d6b8732f7363fb33703752705..8350fa7aabc3356547eec3c63e2c00da9b2c4e14 100644 (file)
@@ -558,6 +558,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->denoise_dct = x264_denoise_dct_avx2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->coeff_last4 = x264_coeff_last4_avx512;
+        pf->coeff_last8 = x264_coeff_last8_avx512;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
@@ -717,6 +725,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->coeff_last8 = x264_coeff_last8_avx512;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
+    }
 #endif // HAVE_MMX
 
 #if HAVE_ALTIVEC
index a20d1cd943790ebf4123ddfd5a22ea2978f0a76b..d281f02d465e9c9e370fb8c76002d9511f134773 100644 (file)
@@ -1756,6 +1756,70 @@ cglobal coeff_last64, 1,3
     RET
 %endif
 
+%macro COEFF_LAST_AVX512 2 ; num, w/d
+cglobal coeff_last%1, 1,2
+    mova         m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
+    vptestm%2    k0, m0, m0
+%if %1 == 15
+    mov         eax, 30
+    kmovw       r1d, k0
+    lzcnt       r1d, r1d
+    sub         eax, r1d
+%else
+    kmovw       eax, k0
+    lzcnt       eax, eax
+    xor         eax, 31
+%endif
+    RET
+%endmacro
+
+%macro COEFF_LAST64_AVX512 1 ; w/d
+cglobal coeff_last64, 1,2
+    pxor        xm0, xm0
+    vpcmp%1      k0, m0, [r0+0*64], 4
+    vpcmp%1      k1, m0, [r0+1*64], 4
+%if HIGH_BIT_DEPTH
+    vpcmp%1      k2, m0, [r0+2*64], 4
+    vpcmp%1      k3, m0, [r0+3*64], 4
+    kunpckwd     k0, k1, k0
+    kunpckwd     k1, k3, k2
+%endif
+%if ARCH_X86_64
+    kunpckdq     k0, k1, k0
+    kmovq       rax, k0
+    lzcnt       rax, rax
+    xor         eax, 63
+%else
+    kmovd       r1d, k1
+    kmovd       eax, k0
+    lzcnt       r1d, r1d
+    lzcnt       eax, eax
+    xor         r1d, 32
+    cmovnz      eax, r1d
+    xor         eax, 31
+%endif
+    RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512  4, d
+INIT_YMM avx512
+COEFF_LAST_AVX512  8, d
+INIT_ZMM avx512
+COEFF_LAST_AVX512 15, d
+COEFF_LAST_AVX512 16, d
+COEFF_LAST64_AVX512 d
+%else ; !HIGH_BIT_DEPTH
+INIT_XMM avx512
+COEFF_LAST_AVX512  8, w
+INIT_YMM avx512
+COEFF_LAST_AVX512 15, w
+COEFF_LAST_AVX512 16, w
+INIT_ZMM avx512
+COEFF_LAST64_AVX512 w
+%endif ; !HIGH_BIT_DEPTH
+
 ;-----------------------------------------------------------------------------
 ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
 ;-----------------------------------------------------------------------------
index e0ce0f2311df9318a23643f206d7e0b0292766d9..4c6ba337e838511114505557b3dcdde51f194165 100644 (file)
@@ -110,6 +110,11 @@ int x264_coeff_last15_lzcnt( dctcoef *dct );
 int x264_coeff_last16_lzcnt( dctcoef *dct );
 int x264_coeff_last64_lzcnt( dctcoef *dct );
 int x264_coeff_last64_avx2 ( dctcoef *dct );
+int x264_coeff_last4_avx512( int32_t *dct );
+int x264_coeff_last8_avx512( dctcoef *dct );
+int x264_coeff_last15_avx512( dctcoef *dct );
+int x264_coeff_last16_avx512( dctcoef *dct );
+int x264_coeff_last64_avx512( dctcoef *dct );
 int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
index d09a06a4464e46d79c63b1b027226ab62ab28118..7575d3f4190d8c0267ffbe009aea77ff57d2a947 100644 (file)
@@ -2008,7 +2008,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    ALIGNED_ARRAY_32( dctcoef, dct1,[64] );
+    ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
     ALIGNED_ARRAY_32( dctcoef, dct2,[64] );
     ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
     ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
@@ -2631,7 +2631,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
             {\
                 for( int j = 0; j < 256; j++ )\
                 {\
-                    ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\
+                    ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
                     uint8_t bitstream[2][1<<16];\
                     static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
                     int ac = ctx_ac[ctx_block_cat];\