x86: AVX-512 sub16x16_dct

author Henrik Gramner <henrik@gramner.com>

Sat, 10 Jun 2017 14:01:53 +0000 (16:01 +0200)

committer Henrik Gramner <henrik@gramner.com>

Sat, 24 Jun 2017 14:23:50 +0000 (16:23 +0200)
author Henrik Gramner <henrik@gramner.com>
Sat, 10 Jun 2017 14:01:53 +0000 (16:01 +0200)
committer Henrik Gramner <henrik@gramner.com>
Sat, 24 Jun 2017 14:23:50 +0000 (16:23 +0200)
diff --git a/common/common.h b/common/common.h

index 333198383f5762d7346a69c9e2f1bde06251cc7d..867b2073c0c7968d5a5410e1faf6b67273221b21 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -778,7 +778,7 @@ struct x264_t
  #define FENC_STRIDE 16
  #define FDEC_STRIDE 32
              ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
-            ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
+            ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
  
              /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
              ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
diff --git a/common/dct.c b/common/dct.c

index 3e7aa39a5ae5cd53374467fed0f13a211d2c01f1..0d7f96debe147df81f30c7ae78c55928a5787f82 100644 (file)
--- a/common/dct.c
+++ b/common/dct.c
@@ -716,6 +716,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
      {
          dctf->sub4x4_dct       = x264_sub4x4_dct_avx512;
          dctf->sub8x8_dct       = x264_sub8x8_dct_avx512;
+        dctf->sub16x16_dct     = x264_sub16x16_dct_avx512;
      }
  #endif //HAVE_MMX
  
diff --git a/common/macroblock.c b/common/macroblock.c

index 8dc9f9757b612c36068336bfe0c2d39ee7b9e47e..6168671591da4c905aca2d82287c821a79c9e5ab 100644 (file)
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -532,16 +532,15 @@ void x264_macroblock_thread_init( x264_t *h )
      h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
      h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
      h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
+    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
      if( CHROMA444 )
      {
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
-        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
-        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
+        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
      }
      else
      {
          h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
          h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
      }
  }
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm

index 31897d9a670ea0b31c41b17e399c9906fa9c1780..42af7c6374654a7379432060ae319b09a4b1a05b 100644 (file)
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -47,10 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
                     dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
                     dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
  %else
-dct_avx512:        dd 0x00000000, 0x00000104, 0x0000014c, 0x00000048 ; bits 0-4:   dct8x8_fenc
-                   dd 0x00000210, 0x00000314, 0x0000035c, 0x00000258 ; bits 5-9:   dct8x8_fdec
-                   dd 0x00000021, 0x00000125, 0x0000016d, 0x00000069
-                   dd 0x00000231, 0x00000335, 0x0000037d, 0x00000279
+dct_avx512:        dd 0x00000000, 0x00021104, 0x0006314c, 0x00042048 ; bits 0-4:   dct8x8_fenc
+                   dd 0x00008a10, 0x00029b14, 0x0006bb5c, 0x0004aa58 ; bits 5-9:   dct8x8_fdec
+                   dd 0x00004421, 0x00025525, 0x0006756d, 0x00046469 ; bits 10-13: dct16x16_fenc
+                   dd 0x0000ce31, 0x0002df35, 0x0006ff7d, 0x0004ee79 ; bits 14-18: dct16x16_fdec
  scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
                     dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
                     dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
@@ -699,6 +699,31 @@ cglobal sub8x8_dct, 3,3
      mova     [r0], m0
      mova  [r0+64], m1
      RET
+
+%macro SUB4x16_DCT_AVX512 2 ; dst, src -- dst: output slot in 64-byte units from r0; src: input step (64 bytes of r1, 128 bytes of r2 per step)
+    vpermd   m1, m5, [r1+1*%2*64]     ; load 64 bytes from r1, reordering dwords by the indices in m5
+    mova     m3,     [r2+2*%2*64]     ; first 64 of the 128 bytes taken from r2 for this step
+    vpermt2d m3, m6, [r2+2*%2*64+64]  ; two-source dword permute: combine m3 with the following 64 bytes of r2, indices in m6
+    call dct4x4x4_internal_avx512     ; helper defined outside this hunk; the stores below consume m0 and m1 afterwards
+    mova [r0+%1*64    ], m0           ; first 64 bytes of output at r0 + dst*64
+    mova [r0+%1*64+128], m1           ; second 64 bytes of output, 128 bytes further on
+%endmacro
+
+cglobal sub16x16_dct
+    psrld    m5, [dct_avx512], 10 ; shift out bits 0-9 of each table dword so the dct16x16_fenc field (bits 10-13) is in the low bits
+    mov     eax, 0xaaaaaaaa       ; NOTE(review): eax is used before PROLOGUE, presumably so it cannot clobber an argument register -- confirm against x86inc
+    kmovd    k1, eax              ; k1 = every odd-numbered mask bit set
+    mov     eax, 0xf0f0f0f0
+    kmovd    k2, eax              ; k2 = upper four bits of each group of eight mask bits set
+    PROLOGUE 3,3                  ; x86inc prologue: 3 arguments, 3 general-purpose registers (r0, r1, r2 are used below)
+    pxor    xm4, xm4              ; zero register; not read in this function, presumably consumed by dct4x4x4_internal_avx512 -- confirm
+    knotw    k3, k2               ; k3 = 16-bit complement of k2 (0x0f0f); k1-k3 are not read here, presumably helper inputs -- confirm
+    psrld    m6, m5, 4            ; a further 4-bit shift (14 in total) brings the dct16x16_fdec field (bits 14-18) into the low bits
+    SUB4x16_DCT_AVX512 0, 0       ; input step 0 -> output bytes   0..63  and 128..191
+    SUB4x16_DCT_AVX512 1, 1       ; input step 1 -> output bytes  64..127 and 192..255
+    SUB4x16_DCT_AVX512 4, 2       ; input step 2 -> output bytes 256..319 and 384..447
+    SUB4x16_DCT_AVX512 5, 3       ; input step 3 -> output bytes 320..383 and 448..511
+    RET
  %endif ; HIGH_BIT_DEPTH
  
  INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h

index f605437a7c00d9d4338c21ae2d62d0a93a5d4f3e..c30b0daadaf99972cf6d602d3672cbf5ea0bbfd7 100644 (file)
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -44,6 +44,7 @@ void x264_sub16x16_dct_xop  ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
  void x264_sub8x8_dct_avx2   ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
  void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
  void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
  void x264_sub8x8_dct_dc_mmx2( int16_t dct    [ 4], uint8_t *pix1, uint8_t *pix2 );
  void x264_sub8x8_dct_dc_sse2( dctcoef dct    [ 4], pixel   *pix1, pixel   *pix2 );
  void x264_sub8x16_dct_dc_sse2 ( dctcoef dct  [ 4], pixel   *pix1, pixel   *pix2 );
author	Henrik Gramner <henrik@gramner.com>
	Sat, 10 Jun 2017 14:01:53 +0000 (16:01 +0200)
committer	Henrik Gramner <henrik@gramner.com>
	Sat, 24 Jun 2017 14:23:50 +0000 (16:23 +0200)
common/common.h		patch \| blob \| history
common/dct.c		patch \| blob \| history
common/macroblock.c		patch \| blob \| history
common/x86/dct-a.asm		patch \| blob \| history
common/x86/dct.h		patch \| blob \| history