void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/****************************************************************************
* x264_nal_encode:
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
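+ /* the _avx2_bmi2 residual kernel requires BMI2 in addition to AVX2, hence the nested check */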
+ if( cpu&X264_CPU_BMI2 )
+ pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
}
#endif
#endif
db 4, 4, 4, 4, 5, 6, 7, 7
%if ARCH_X86_64
-%macro COEFF_LAST_TABLE 16
+%macro COEFF_LAST_TABLE 17
%define funccpu1 %1
%define funccpu2 %2
+ %define funccpu3 %3
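+ ; funccpu1 handles the 4-coeff case, funccpu2 the 64-coeff cases, funccpu3 everything else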
%rep 14
- %ifidn %3, 4
- dq mangle(x264_coeff_last%3_ %+ funccpu1)
+ %ifidn %4, 4
+ dq mangle(x264_coeff_last%4_ %+ funccpu1)
+ %elifidn %4, 64
+ dq mangle(x264_coeff_last%4_ %+ funccpu2)
%else
- dq mangle(x264_coeff_last%3_ %+ funccpu2)
+ dq mangle(x264_coeff_last%4_ %+ funccpu3)
%endif
%rotate 1
%endrep
cextern coeff_last16_sse2_lzcnt
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
+cextern coeff_last64_avx2_lzcnt
-coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
-coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
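+; only coeff_last64 has an AVX2 implementation, so the other entries in the
+; avx2_lzcnt table still point at the existing mmx2/sse2 lzcnt functions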
%endif
SECTION .text
cextern count_cat_m1
cextern cabac_encode_ue_bypass
-; t3 must be ecx, since it's used for shift.
-%if WIN64
- DECLARE_REG_TMP 3,1,2,0,5,6,4,4
- %define pointer resq
-%elif ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,6,6
+%if ARCH_X86_64
%define pointer resq
%else
- DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%define pointer resd
%endif
%endif
%endmacro
-cglobal cabac_encode_decision_asm, 1,7
+%macro CABAC 1
+; t3 must be ecx, since it's used for shift.
+%if WIN64
+ DECLARE_REG_TMP 3,1,2,0,5,6,4,4
+%elif ARCH_X86_64
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,6
+%else
+ DECLARE_REG_TMP 0,4,2,1,3,5,6,2
+%endif
+
+cglobal cabac_encode_decision_%1, 1,7
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
movzx t6d, byte [r0+cb.state+t1]
mov [t0+cb.state+t1], t4b
;cabac_encode_renorm
mov t4d, t3d
+%ifidn %1, bmi2
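+ ; renorm shift = lzcnt(range)-23 (i.e. 8-floor(log2(range))), computed directly
+ ; instead of via the cabac_renorm_shift table; shlx takes its count from any
+ ; register, so the shift doesn't have to live in cl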
+ lzcnt t3d, t3d
+ sub t3d, 23
+ shlx t4d, t4d, t3d
+ shlx t6d, t6d, t3d
+%else
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, t3
+ shl t4d, t3b
+ shl t6d, t3b
+%endif
%if WIN64
POP r7
%endif
- shl t4d, t3b
- shl t6d, t3b
mov [t0+cb.range], t4d
add t3d, [t0+cb.queue]
- jge cabac_putbyte
+ jge cabac_putbyte_%1
.update_queue_low:
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
RET
-cglobal cabac_encode_bypass_asm, 2,3
+cglobal cabac_encode_bypass_%1, 2,3
mov t7d, [r0+cb.low]
and r1d, [r0+cb.range]
lea t7d, [t7*2+r1]
mov t3d, [r0+cb.queue]
inc t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
- jge cabac_putbyte
+ jge cabac_putbyte_%1
%else
jge .putbyte
%endif
.putbyte:
PROLOGUE 0,7
movifnidn t6d, t7d
- jmp cabac_putbyte
+ jmp cabac_putbyte_%1
%endif
-cglobal cabac_encode_terminal_asm, 1,3
+%ifnidn %1, bmi2
+cglobal cabac_encode_terminal_%1, 1,3
sub dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
mov t6d, [t0+cb.low]
+%endif
-cabac_putbyte:
+cabac_putbyte_%1:
; alive: t0=cb t3=queue t6=low
%if WIN64
DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
+%ifidn %1, bmi2
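+ ; shrx extracts the pending byte(s) from the top of low and bzhi clears those
+ ; bits in place, replacing the mask build (-1/not/and) of the generic path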
+ add t3d, 10
+ shrx t2d, t6d, t3d
+ bzhi t6d, t6d, t3d
+ sub t3d, 18
+%else
mov t1d, -1
add t3d, 10
mov t2d, t6d
not t1d
sub t3d, 18
and t6d, t1d
+%endif
mov t5d, [t0+cb.bytes_outstanding]
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
.postpone:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
- jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
+ jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
+%endmacro
+
+CABAC asm
+CABAC bmi2
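+; the generic variant keeps the _asm suffix so existing callers are unchanged;
+; asm call sites pick the bmi2 variant via cpuflag(bmi2) in CALL_CABAC below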
; %1 = label name
; %2 = node_ctx init?
;-----------------------------------------------------------------------------
%macro CALL_CABAC 0
+%if cpuflag(bmi2)
+ call cabac_encode_decision_bmi2
+%else
call cabac_encode_decision_asm
+%endif
%if WIN64 ; move cabac back
mov r0, r3
%endif
movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
mov r1d, r11d
+%if cpuflag(bmi2)
+ call cabac_encode_bypass_bmi2
+%else
call cabac_encode_bypass_asm
+%endif
%if WIN64
mov r0, r3
%endif
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
+INIT_XMM avx2,bmi2
+CABAC_RESIDUAL coeff_last_avx2_lzcnt
%endif
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<22)
-%assign cpuflags_bmi1 (1<<23)
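+; every CPU with BMI1 also supports LZCNT, so bmi1 now implies lzcnt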
+%assign cpuflags_bmi1 (1<<23)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
-%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
x264_cabac_encode_bypass( cb, coeff_sign );
} while( --coeff_idx >= 0 );
}
-static void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+
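+/* non-static so that checkasm can call the plain C implementation directly
+ * instead of #including encoder/cabac.c */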
+void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+ x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
+}
+
+static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
- x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
+ x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
#endif
}
static void x264_cabac_block_residual_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (h->param.cpu & X264_CPU_SSE42) )
continue;
+ if( !strcmp(x264_cpu_names[i].name, "BMI1")
+ && (h->param.cpu & X264_CPU_BMI2) )
+ continue;
if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
#include <ctype.h>
#include "common/common.h"
#include "common/cpu.h"
-#include "encoder/cabac.c"
// GCC doesn't align stack variables on ARM, so use .bss
#if ARCH_ARM
#define run_cabac_terminal_asm run_cabac_terminal_c
#endif
-static void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
-{
- x264_cabac_block_residual_internal( h, cb, ctx_block_cat, l, 0 );
- cb->p = cb->p_start;
-}
-
-/* Wrapper to roll back the pointer to avoid running out of memory bounds during
- * benchmark repetitions. Introduces slight bias into the test, but not too much. */
-static void x264_cabac_block_residual_asm( void (*c)( dctcoef *, int, intptr_t, x264_cabac_t * ),
- dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb )
-{
- c( l, b_interlaced, ctx_block_cat, cb );
- cb->p = cb->p_start;
-}
-
+extern const uint8_t x264_count_cat_m1[14];
+void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
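+/* Each iteration writes into its own 64K bitstream buffer and only every 16th
+ * input is benchmarked, so the old wrapper that rolled cb->p back between
+ * benchmark repetitions is no longer needed. */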
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
- static int cabac_checked = 0;\
- if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || ((cpu_new&X264_CPU_SSE2) && !cabac_checked)) )\
+ if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
{\
- cabac_checked = 1;\
used_asm = 1;\
set_func_name( #name );\
for( int i = 0; i < 2; i++ )\
{\
for( int j = 0; j < 256; j++ )\
{\
- ALIGNED_ARRAY_16( dctcoef, dct, [2],[64] );\
+ ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
+ uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
int nz = 0;\
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
- x264_cabac_encode_init( &cb[0], buf3, buf3+0x3f0 );\
- x264_cabac_encode_init( &cb[1], buf4, buf4+0x3f0 );\
+ x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
+ x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
cb[0].f8_bits_encoded = 0;\
cb[1].f8_bits_encoded = 0;\
- if( !rd ) memcpy( buf4, buf3, 0x400 );\
+ if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
- if( !rd ) ok |= !memcmp( buf3, buf4, 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
+ if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
{\
fprintf( stderr, #name " : [FAILED] ctx_block_cat %d", (int)ctx_block_cat );\
fprintf( stderr, "\n");\
goto name##fail;\
}\
- call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
- if( rd ) call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
- else call_a2( x264_cabac_block_residual_asm, bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ if( (j&15) == 0 )\
+ {\
+ call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
+ call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
+ }\
}\
}\
}\
CABAC_RESIDUAL( cabac_block_residual, 0, DCT_LUMA_8x8, 0 )
report( "cabac residual:" );
+ ok = 1; used_asm = 0;
CABAC_RESIDUAL( cabac_block_residual_rd, 0, DCT_LUMA_8x8-1, 1 )
CABAC_RESIDUAL( cabac_block_residual_8x8_rd, DCT_LUMA_8x8, DCT_LUMA_8x8, 1 )
report( "cabac residual rd:" );
if( cpu_ref || run_cabac_decision_c == run_cabac_decision_asm )
return ret;
- used_asm = 1;
+ ok = 1; used_asm = 0;
x264_cabac_init( &h );
set_func_name( "cabac_encode_decision" );
if( x264_cpu_detect() & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
- if( x264_cpu_detect() & X264_CPU_BMI2 )
- {
- ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
- cpu1 &= ~X264_CPU_BMI2;
- }
cpu1 &= ~X264_CPU_BMI1;
}
if( x264_cpu_detect() & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
+ if( x264_cpu_detect() & X264_CPU_BMI2 )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
+ cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
+ }
if( x264_cpu_detect() & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );