granicus.if.org Git - libx264/commitdiff
Much faster CAVLC RDO and bitstream writing
author Fiona Glaser <fiona@x264.com>
Tue, 30 Dec 2008 03:12:17 +0000 (03:12 +0000)
committer Loren Merritt <pengvado@akuvian.org>
Tue, 30 Dec 2008 03:32:34 +0000 (03:32 +0000)
Pure asm version of level/run coding.  Over 2x faster than C.
Up to 40% faster CAVLC RDO.  Overall benefit up to ~7.5% with RDO or ~5% with fast encoding settings.

common/bs.h
common/quant.c
common/quant.h
common/x86/quant-a.asm
common/x86/quant.h
encoder/cavlc.c
tools/checkasm.c

index 9882c23c11319d97edbb1dc03245a389dd2da7ef..0765f50aa3ac0b9f2c2f22ce7989e1f13e644292 100644 (file)
@@ -50,6 +50,13 @@ typedef struct bs_s
     int     i_bits_encoded; /* RD only */
 } bs_t;
 
+typedef struct /* output of the coeff_level_run functions; the x86 asm in this commit addresses level at byte offset 4 and run at byte offset 36, so the field order and sizes must not change */
+{
+    int     last;      /* index of the highest nonzero coefficient in the scanned block */
+    int16_t level[16]; /* nonzero coefficient values, stored from the highest index downwards */
+    uint8_t run[16];   /* run[i]: number of zero coefficients directly below the coefficient stored in level[i] */
+} x264_run_level_t;
+
 extern const vlc_t x264_coeff0_token[5];
 extern const vlc_t x264_coeff_token[5][16*4];
 extern const vlc_t x264_total_zeros[15][16];
index ee7b9485c5750773598b472bc04af4cdd65b82a8..fa38360cc0d4e3aa610db3b040edf0856ef1a0e0 100644 (file)
@@ -273,6 +273,27 @@ static int x264_coeff_last64( int16_t *l )
     return x264_coeff_last_internal( l, 64 );
 }
 
+#define level_run(num) /* defines x264_coeff_level_run<num>: C version of the run/level scan; returns how many levels were stored */\
+static int x264_coeff_level_run##num( int16_t *dct, x264_run_level_t *runlevel )\
+{\
+    int i_last = runlevel->last = x264_coeff_last##num(dct); /* the scan starts at the index reported by coeff_last and walks downwards */\
+    int i_total = 0; /* number of levels stored so far */\
+    do\
+    {\
+        int r = 0; /* zeros found directly below the current coefficient */\
+        runlevel->level[i_total] = dct[i_last]; /* read unconditionally on the first pass; NOTE(review): presumably callers guarantee a nonzero coefficient exists -- confirm */\
+        while( --i_last >= 0 && dct[i_last] == 0 )\
+            r++;\
+        runlevel->run[i_total++] = r;\
+    } while( i_last >= 0 ); /* i_last now indexes the next nonzero coefficient, or is -1 when the block is exhausted */\
+    return i_total;\
+}
+
+level_run(4)  /* x264_coeff_level_run4: installed for DCT_CHROMA_DC */
+level_run(15) /* x264_coeff_level_run15: installed for DCT_LUMA_AC */
+level_run(16) /* x264_coeff_level_run16: installed for DCT_LUMA_4x4 */
+
+
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 {
     pf->quant_8x8 = quant_8x8;
@@ -293,6 +314,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15;
     pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
     pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
+    pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
+    pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
+    pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
 
 #ifdef HAVE_MMX
     if( cpu&X264_CPU_MMX )
@@ -323,8 +347,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[  DCT_LUMA_AC] = x264_coeff_last15_mmxext;
         pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
         pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
+        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
 #endif
         pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
+        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
     }
 
     if( cpu&X264_CPU_SSE2 )
@@ -347,6 +374,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
         pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
+        pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
     }
 
     if( cpu&X264_CPU_SSSE3 )
@@ -375,4 +404,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #endif
     pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
     pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
+    pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
+    pf->coeff_level_run[DCT_CHROMA_AC] = pf->coeff_level_run[ DCT_LUMA_AC];
 }
index dabd60cef4a047ce8ae8bfc6528e4bb2c357b49c..eaac5937c09f13efa07e7811b851ef2fcf2900ef 100644 (file)
@@ -40,6 +40,7 @@ typedef struct
     int (*decimate_score16)( int16_t *dct );
     int (*decimate_score64)( int16_t *dct );
     int (*coeff_last[6])( int16_t *dct );
+    int (*coeff_level_run[5])( int16_t *dct, x264_run_level_t *runlevel );
 } x264_quant_function_t;
 
 void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
index e80f9058e82011dfd6ed6854cf5b010bc8e10e68..5cbdf4a818efda8c1d3e40b19ddcba38eb700ff7 100644 (file)
@@ -662,6 +662,10 @@ INIT_XMM
 DECIMATE8x8 sse2
 DECIMATE8x8 ssse3
 
+;-----------------------------------------------------------------------------
+; int x264_coeff_last( int16_t *dct )
+;-----------------------------------------------------------------------------
+
 %macro LAST_MASK_SSE2 2-3
     movdqa   xmm0, [%2+ 0]
     pxor     xmm2, xmm2
@@ -766,3 +770,63 @@ COEFF_LAST mmxext
 %endif
 %define LAST_MASK LAST_MASK_SSE2
 COEFF_LAST sse2
+
+;-----------------------------------------------------------------------------
+; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
+;-----------------------------------------------------------------------------
+
+%macro LAST_MASK4_MMX 2-3 ; args: output mask register, address of 4 int16 coefficients; an optional third argument is accepted but not used
+    movq     mm0, [%2]     ; load 8 bytes = 4 coefficients
+    pxor     mm2, mm2      ; mm2 = 0 for the comparison below
+    packsswb mm0, mm0      ; saturate words to bytes; the 4 results appear in both halves of mm0
+    pcmpeqb  mm0, mm2      ; byte = 0xFF where the coefficient is zero
+    pmovmskb  %1, mm0      ; one bit per byte: set where the coefficient is ZERO (the caller inverts it)
+%endmacro
+
+; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
+%ifdef ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3,4,5,6
+%else
+    DECLARE_REG_TMP 6,3,2,1,4,5,0
+%endif
+
+%macro COEFF_LEVELRUN 2 ; args: cpu suffix, coefficient count (4, 15 or 16); emits the asm counterpart of the C coeff_level_run function
+cglobal x264_coeff_level_run%2_%1,0,7
+    movifnidn t0d, r0m ; t0 = dct
+    movifnidn t1d, r1m ; t1 = runlevel
+    LAST_MASK t2d, t0-(%2&1)*2, t4d ; t2 = per-coefficient mask; for an odd count the load starts one coefficient before dct
+    not    t2d ; LAST_MASK4_MMX visibly sets bits for zero coefficients (presumably the other LAST_MASK variants too); invert so set = nonzero
+    shl    t2d, 32-((%2+1)&~1) ; left-align the mask (shift 28 for count 4, 16 for counts 15/16) so the highest coefficient sits in bit 31
+    mov    t4d, %2-1 ; t4 = highest coefficient index
+    mov    t5d, t2d ; t5 = working copy of the mask, consumed from the top
+    bsr    t3d, t2d ; position of the highest set bit
+    xor    t6d, t6d ; t6 = number of levels stored so far (the return value)
+    shl    t5d, 1
+    xor    t3d, 0x1f ; 31 - bit position = number of zero coefficients above the last nonzero one
+    sub    t4d, t3d ; t4 = index of the last nonzero coefficient
+    shl    t5d, t3b ; together with the shift by 1 above: drop the last nonzero coefficient's own bit
+    mov   [t1], t4d ; runlevel->last
+.loop:
+    bsr    t3d, t5d ; NOTE(review): on the final coefficient t5d can be 0, where bsr's destination is architecturally undefined; the run stored for that entry and the loop exit appear to rely on actual CPU behaviour -- confirm
+    xor    t3d, 0x1f ; t3 = run of zeros directly below the current coefficient
+    mov    t2w, [t0+t4*2] ; current coefficient value
+    mov   [t1+t6  +36], t3b ; runlevel->run[t6]   (36 = 4-byte last + 32-byte level array)
+    mov   [t1+t6*2+ 4], t2w ; runlevel->level[t6]
+    inc    t3d ; run + 1 = distance down to the next nonzero coefficient
+    shl    t5d, t3b ; consume the run and the next coefficient's bit
+    inc    t6d
+    sub    t4d, t3d ; step the index down to the next nonzero coefficient
+    jge .loop ; continue while the index is still >= 0
+    RET
+%endmacro
+
+%ifndef ARCH_X86_64
+%define LAST_MASK LAST_MASK_MMX
+COEFF_LEVELRUN mmxext, 15 ; x264_coeff_level_run15_mmxext, assembled only when ARCH_X86_64 is not defined
+COEFF_LEVELRUN mmxext, 16 ; x264_coeff_level_run16_mmxext, likewise
+%endif
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext, 4 ; x264_coeff_level_run4_mmxext, expanded with the 4-coefficient mask macro
+%define LAST_MASK LAST_MASK_SSE2
+COEFF_LEVELRUN sse2, 15 ; x264_coeff_level_run15_sse2
+COEFF_LEVELRUN sse2, 16 ; x264_coeff_level_run16_sse2
index 8e9dbb65301b38674896699923b8ee9f238aec2d..46186ceb567c0fe794f8c2e814207c4762f0b4e1 100644 (file)
@@ -64,5 +64,10 @@ int x264_coeff_last64_mmxext( int16_t *dct );
 int x264_coeff_last15_sse2( int16_t *dct );
 int x264_coeff_last16_sse2( int16_t *dct );
 int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
 
 #endif
index 6abff739ddafd262757137b03e6c17b03155ab6a..4f4ff03371a373a4ec934b56d2c980276f40fe21 100644 (file)
@@ -96,7 +96,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, bs_t *s, int i_s
                 /* Weight highly against overflows. */
                 s->i_bits_encoded += 1000000;
 #else
-                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile", i_level_code );
+                x264_log(h, X264_LOG_WARNING, "OVERFLOW levelcode=%d is only allowed in High Profile\n", i_level_code );
                 /* clip level, preserving sign */
                 i_level_code = (1<<12) - 2 + (i_level_code & 1);
 #endif
@@ -116,8 +116,8 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
 {
     static const uint8_t ct_index[17] = {0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,3};
     static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
-    int level[16], run[16];
-    int i_trailing, i_total_zero, i_last, i_suffix_length, i;
+    x264_run_level_t runlevel;
+    int i_trailing, i_total_zero, i_suffix_length, i;
     int i_total = 0;
     unsigned int i_sign;
     /* x264_mb_predict_non_zero_code return 0 <-> (16+16+1)>>1 = 16 */
@@ -129,32 +129,22 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
         return;
     }
 
-    i_last = h->quantf.coeff_last[i_ctxBlockCat](l);
-    i_total_zero = i_last + 1;
-
     /* level and run and total */
     /* set these to 2 to allow branchless i_trailing calculation */
-    level[1] = 2;
-    level[2] = 2;
-    do
-    {
-        int r = 0;
-        level[i_total] = l[i_last];
-        while( --i_last >= 0 && l[i_last] == 0 )
-            r++;
-        run[i_total++] = r;
-    } while( i_last >= 0 );
+    runlevel.level[1] = 2;
+    runlevel.level[2] = 2;
+    i_total = h->quantf.coeff_level_run[i_ctxBlockCat]( l, &runlevel );
+    i_total_zero = runlevel.last + 1 - i_total;
 
     h->mb.cache.non_zero_count[x264_scan8[i_idx]] = i_total;
 
-    i_total_zero -= i_total;
-    i_trailing = ((((level[0]+1) | (1-level[0])) >> 31) & 1) // abs(level[0])>1
-               | ((((level[1]+1) | (1-level[1])) >> 31) & 2)
-               | ((((level[2]+1) | (1-level[2])) >> 31) & 4);
+    i_trailing = ((((runlevel.level[0]+1) | (1-runlevel.level[0])) >> 31) & 1) // abs(runlevel.level[0])>1
+               | ((((runlevel.level[1]+1) | (1-runlevel.level[1])) >> 31) & 2)
+               | ((((runlevel.level[2]+1) | (1-runlevel.level[2])) >> 31) & 4);
     i_trailing = ctz_index[i_trailing];
-    i_sign = ((level[2] >> 31) & 1)
-           | ((level[1] >> 31) & 2)
-           | ((level[0] >> 31) & 4);
+    i_sign = ((runlevel.level[2] >> 31) & 1)
+           | ((runlevel.level[1] >> 31) & 2)
+           | ((runlevel.level[0] >> 31) & 4);
     i_sign >>= 3-i_trailing;
 
     /* total/trailing */
@@ -166,10 +156,10 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
 
     if( i_trailing < i_total )
     {
-        int16_t val = level[i_trailing];
-        int16_t val_original = level[i_trailing]+LEVEL_TABLE_SIZE/2;
+        int16_t val = runlevel.level[i_trailing];
+        int16_t val_original = runlevel.level[i_trailing]+LEVEL_TABLE_SIZE/2;
         if( i_trailing < 3 )
-            val -= (val>>15)|1; /* as level[i] can't be 1 for the first one if i_trailing < 3 */
+            val -= (val>>15)|1; /* as runlevel.level[i] can't be 1 for the first one if i_trailing < 3 */
         val += LEVEL_TABLE_SIZE/2;
 
         if( (unsigned)val_original < LEVEL_TABLE_SIZE )
@@ -181,7 +171,7 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
             i_suffix_length = block_residual_write_cavlc_escape( h, s, i_suffix_length, val-LEVEL_TABLE_SIZE/2 );
         for( i = i_trailing+1; i < i_total; i++ )
         {
-            val = level[i] + LEVEL_TABLE_SIZE/2;
+            val = runlevel.level[i] + LEVEL_TABLE_SIZE/2;
             if( (unsigned)val < LEVEL_TABLE_SIZE )
             {
                 bs_write_vlc( s, x264_level_token[i_suffix_length][val] );
@@ -203,8 +193,8 @@ static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_ctxBlockCat, i
     for( i = 0; i < i_total-1 && i_total_zero > 0; i++ )
     {
         int i_zl = X264_MIN( i_total_zero - 1, 6 );
-        bs_write_vlc( s, x264_run_before[i_zl][run[i]] );
-        i_total_zero -= run[i];
+        bs_write_vlc( s, x264_run_before[i_zl][runlevel.run[i]] );
+        i_total_zero -= runlevel.run[i];
     }
 }
 
index e98961c40de490db8894e3f8eea6b64ea587f4ee..85658b77bae66643775f1e12daeecd04ccf2256d 100644 (file)
@@ -1127,7 +1127,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     ok = oks[1]; used_asm = used_asms[1];
     report( "dequant :" );
 
-    ok = 1;
+    ok = 1; used_asm = 0;
     if( qf_a.denoise_dct != qf_ref.denoise_dct )
     {
         int size;
@@ -1160,21 +1160,18 @@ static int check_quant( int cpu_ref, int cpu_new )
                 dct1[idx] = !(rand()&3) + (!(rand()&15))*(rand()&3); \
             if( ac ) \
                 dct1[0] = 0; \
-            memcpy( dct2, dct1, w*w*2 ); \
-            result_c = call_c1( qf_c.decname, (void*)dct2 ); \
-            result_a = call_a1( qf_a.decname, (void*)dct2 ); \
+            result_c = call_c( qf_c.decname, (void*)dct1 ); \
+            result_a = call_a( qf_a.decname, (void*)dct1 ); \
             if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #decname ": [FAILED]\n" ); \
                 break; \
             } \
-            call_c2( qf_c.decname, (void*)dct2 ); \
-            call_a2( qf_a.decname, (void*)dct2 ); \
         } \
     }
 
-    ok = 1;
+    ok = 1; used_asm = 0;
     TEST_DECIMATE( decimate_score64, 8, 0, 6 );
     TEST_DECIMATE( decimate_score16, 4, 0, 6 );
     TEST_DECIMATE( decimate_score15, 4, 1, 7 );
@@ -1194,27 +1191,60 @@ static int check_quant( int cpu_ref, int cpu_new )
                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
             if( !nnz ) \
                 dct1[ac] = 1; \
-            memcpy( dct2, dct1, w*w*2 ); \
-            result_c = call_c1( qf_c.last, (void*)(dct2+ac) ); \
-            result_a = call_a1( qf_a.last, (void*)(dct2+ac) ); \
+            result_c = call_c( qf_c.last, (void*)(dct1+ac) ); \
+            result_a = call_a( qf_a.last, (void*)(dct1+ac) ); \
             if( result_c != result_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #lastname ": [FAILED]\n" ); \
                 break; \
             } \
-            call_c2( qf_c.last, (void*)(dct2+ac) ); \
-            call_a2( qf_a.last, (void*)(dct2+ac) ); \
         } \
     }
 
-    ok = 1;
+    ok = 1; used_asm = 0;
     TEST_LAST( coeff_last[DCT_CHROMA_DC],  coeff_last4, 2, 0 );
     TEST_LAST( coeff_last[  DCT_LUMA_AC], coeff_last15, 4, 1 );
     TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
     TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
     report( "coeff_last :" );
 
+#define TEST_LEVELRUN( lastname, name, w, ac ) /* checks qf_a.lastname against qf_c.lastname on 100 random w*w blocks; ac is the index of the first coefficient passed to the function */ \
+    if( qf_a.lastname != qf_ref.lastname ) /* only run when the tested table entry differs from the reference table's */ \
+    { \
+        set_func_name( #name ); \
+        used_asm = 1; \
+        for( i = 0; i < 100; i++ ) \
+        { \
+            x264_run_level_t runlevel_c, runlevel_a; \
+            int result_c, result_a, idx, nnz=0; \
+            int max = rand() & (w*w-1); /* random upper bound for the populated part of the block */ \
+            memset( dct1, 0, w*w*2 ); \
+            memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); /* both outputs start from the same bytes of buf1 */ \
+            memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
+            for( idx = ac; idx < max; idx++ ) \
+                nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
+            if( !nnz ) /* never hand the functions an all-zero block */ \
+                dct1[ac] = 1; \
+            result_c = call_c( qf_c.lastname, (void*)(dct1+ac), &runlevel_c ); \
+            result_a = call_a( qf_a.lastname, (void*)(dct1+ac), &runlevel_a ); \
+            if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
+                memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
+                memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) /* only result_c-1 runs are compared: the run stored for the final coefficient is not checked */ \
+            { \
+                ok = 0; \
+                fprintf( stderr, #name ": [FAILED]\n" ); \
+                break; \
+            } \
+        } \
+    }
+
+    ok = 1; used_asm = 0;
+    TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC],  coeff_level_run4, 2, 0 );
+    TEST_LEVELRUN( coeff_level_run[  DCT_LUMA_AC], coeff_level_run15, 4, 1 );
+    TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+    report( "coeff_level_run :" );
+
     return ret;
 }