From: Henrik Gramner Date: Sun, 26 Mar 2017 09:34:18 +0000 (+0200) Subject: x86: AVX-512 zigzag_scan_8x8_field X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=edb22f57ba03718c1cb9781ba005aec20a1e50e0;p=libx264 x86: AVX-512 zigzag_scan_8x8_field --- diff --git a/common/dct.c b/common/dct.c index 1be89350..cd263c92 100644 --- a/common/dct.c +++ b/common/dct.c @@ -990,6 +990,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; } #endif // HAVE_MMX @@ -1036,6 +1037,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig { pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; } #endif // HAVE_MMX diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index 5a4f316a..ec696f49 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -38,11 +38,19 @@ scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2 dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3 ; bits 19-23: 8x8_frame4 +scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1 + dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2 + dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3 + dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4 %else scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1 dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2 dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30 +scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1 + dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2 + dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a + dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2 %endif pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 @@ -1943,6 +1951,29 @@ cglobal zigzag_scan_8x8_frame, 2,2 vmovdqa64 m2 {k2}, m3 mova [r0+3*64], m2 RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + mov r1d, 0x3f + kmovb k1, r1d + psrld m5, m0, 5 + vpermi2d m0, m1, m2 + vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15 + vpermt2d m1, m5, m2 + psrld m5, 5 + vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31 + vpermt2d m2, m5, m3 + psrld m5, 5 + vpermt2d m3, m5, m4 + mova [r0+0*64], m0 + mova [r0+1*64], m1 + mova [r0+2*64], m2 + mova [r0+3*64], m3 + RET %else ; !HIGH_BIT_DEPTH INIT_YMM avx512 cglobal zigzag_scan_4x4_frame, 2,2 @@ -1961,6 +1992,7 @@ cglobal zigzag_scan_4x4_field, 2,2 INIT_ZMM avx512 cglobal zigzag_scan_8x8_frame, 2,2 psrlw m0, [scan_frame_avx512], 4 +scan8_avx512: mova m1, [r1] mova m2, [r1+64] psrlw m3, m0, 6 @@ -1969,4 +2001,8 @@ cglobal zigzag_scan_8x8_frame, 2,2 mova [r0], m0 mova [r0+64], m1 RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + jmp scan8_avx512 %endif ; !HIGH_BIT_DEPTH diff --git a/common/x86/dct.h b/common/x86/dct.h index 1a5c75c4..a11a6dcd 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -116,10 +116,11 @@ void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] ); void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] ); int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );