granicus.if.org Git - libx264/commitdiff
AVX mbtree_propagate
author Fiona Glaser <fiona@x264.com>
Tue, 17 May 2011 21:50:51 +0000 (14:50 -0700)
committer Fiona Glaser <fiona@x264.com>
Wed, 15 Jun 2011 02:43:11 +0000 (19:43 -0700)
Up to ~20-30% faster than SSE2 on Sandy Bridge.

common/frame.c
common/macroblock.c
common/x86/mc-a2.asm
common/x86/mc-c.c
tools/checkasm.c

index 40b90bb4ba9297a4ff949101cdb365085ca5e8ac..759e241123a87d1a4b39e57088fb988d09699488 100644 (file)
@@ -179,7 +179,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
                     CHECKED_MALLOCZERO( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
                     CHECKED_MALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
                 }
-            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+3) * sizeof(uint16_t) );
+            CHECKED_MALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
             for( int j = 0; j <= h->param.i_bframe+1; j++ )
                 for( int i = 0; i <= h->param.i_bframe+1; i++ )
                     CHECKED_MALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
index 891cf999920ba95a516e45f34f65cdf875fa9e28..8ca8eca163de86d71aa51a4bf5c52020fc68eff4 100644 (file)
@@ -342,7 +342,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
             ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
         scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
     }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+3)&~3) * sizeof(int);
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
     scratch_size = X264_MAX( scratch_size, buf_mbtree );
     if( scratch_size )
         CHECKED_MALLOC( h->scratch_buffer, scratch_size );
index eb757858af7fda23e4da0fab42234ccbd0e3cb0f..a448ebaec239adc47fba304805c4d4881fa93e8a 100644 (file)
@@ -40,7 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
-pf_inv256: times 4 dd 0.00390625
+pf_inv256: times 8 dd 0.00390625
 
 pad10: times 8 dw    10*PIXEL_MAX
 pad20: times 8 dw    20*PIXEL_MAX
@@ -1630,7 +1630,7 @@ FRAME_INIT_LOWRES ssse3
 ;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 ;-----------------------------------------------------------------------------
 cglobal mbtree_propagate_cost_sse2, 7,7,7
-    shl        r6d, 1
+    add        r6d, r6d
     lea         r0, [r0+r6*2]
     add         r1, r6
     add         r2, r6
@@ -1673,3 +1673,49 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
     jl .loop
     REP_RET
 
+%macro INT16_TO_FLOAT 1 ; arg = register number N: widen the 8 words in xmmN to 8 floats in ymmN; needs xmm7 == 0, clobbers xmm4
+    vpunpckhwd   xmm4, xmm%1, xmm7 ; xmm4 = upper 4 words interleaved with zero words from xmm7 (zero-extended to dwords)
+    vpunpcklwd  xmm%1, xmm7 ; same for the lower 4 words, in place (two-operand form: destination doubles as first source)
+    vinsertf128 ymm%1, ymm%1, xmm4, 1 ; place the upper 4 dwords into the high 128-bit lane
+    vcvtdq2ps   ymm%1, ymm%1 ; convert the 8 packed int32 values to single-precision floats
+%endmacro
+
+; FIXME: align loads/stores to 16 bytes
+cglobal mbtree_propagate_cost_avx, 7,7,8 ; by the C prototype and the loop comments: r0=dst, r1=propagate_in, r2=intra_costs, r3=inter_costs, r4=inv_qscales, r5=fps_factor, r6=len
+    add           r6d, r6d ; r6 = len*2 = byte length of each uint16_t input array
+    lea            r0, [r0+r6*2] ; dst holds 4-byte ints, so its end is len*4 bytes away
+    add            r1, r6 ; point every input at its end ...
+    add            r2, r6
+    add            r3, r6
+    add            r4, r6
+    neg            r6 ; ... and index with a negative offset that counts up to zero
+    vmovdqa      xmm5, [pw_3fff] ; mask for inter costs (constant defined outside this hunk; presumably 0x3fff per word)
+    vbroadcastss ymm6, [r5] ; replicate *fps_factor into all 8 float lanes
+    vmulps       ymm6, ymm6, [pf_inv256] ; pre-scale by 1/256 (0.00390625) so the loop needs no >>8
+    vpxor        xmm7, xmm7 ; xmm7 = 0, the zero source INT16_TO_FLOAT relies on
+.loop:
+    vmovdqu      xmm0, [r2+r6]       ; intra (8 x uint16, unaligned load)
+    vmovdqu      xmm1, [r4+r6]       ; invq
+    vmovdqu      xmm2, [r1+r6]       ; prop
+    vpand        xmm3, xmm5, [r3+r6] ; inter, masked with pw_3fff
+    INT16_TO_FLOAT 0 ; widen all four inputs to 8 floats each (each expansion clobbers xmm4)
+    INT16_TO_FLOAT 1
+    INT16_TO_FLOAT 2
+    INT16_TO_FLOAT 3
+    vmulps       ymm1, ymm1, ymm0 ; intra * invq
+    vsubps       ymm4, ymm0, ymm3 ; intra - inter
+    vmulps       ymm1, ymm1, ymm6    ; intra*invq*fps_factor>>8
+    vaddps       ymm1, ymm1, ymm2    ; prop + (intra*invq*fps_factor>>8)
+    vrcpps       ymm3, ymm0          ; 1 / intra 1st approximation
+    vmulps       ymm2, ymm0, ymm3    ; intra * (1/intra 1st approx)
+    vmulps       ymm2, ymm2, ymm3    ; intra * (1/intra 1st approx)^2
+    vmulps       ymm1, ymm1, ymm4    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+    vaddps       ymm3, ymm3, ymm3    ; 2 * (1/intra 1st approx)
+    vsubps       ymm3, ymm3, ymm2    ; 2nd approximation for 1/intra (one Newton-Raphson step: 2x - intra*x^2)
+    vmulps       ymm1, ymm1, ymm3    ; / intra
+    vcvtps2dq    ymm1, ymm1 ; back to int32, rounded per the current MXCSR rounding mode
+    vmovdqu [r0+r6*2], ymm1 ; store 8 int results (32 bytes, unaligned) to dst
+    add            r6, 16 ; advance 16 input bytes = 8 uint16 elements
+    jl .loop ; whole groups of 8 only: if len is not a multiple of 8 the last pass touches elements past len (NOTE(review): relies on caller-side buffer padding - confirm)
+    vzeroupper ; clear the upper ymm halves before returning to non-VEX code
+    RET
index 09e90c7835637a688f7167cf83a40c7f4fd55611..0f88be382ce47546043574c5ee3b8231a7308334 100644 (file)
@@ -140,6 +140,8 @@ void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
 void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+                                     uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
@@ -728,4 +730,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_STACK_MOD4) )
         pf->mc_chroma = x264_mc_chroma_avx;
 #endif // HIGH_BIT_DEPTH
+
+    if( !(cpu&X264_CPU_AVX) )
+        return;
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
 }
index dc22411757a2801967f86582bae57280ac442c5f..88ae68f78aca63971243f769757bc33ebfc27cc7 100644 (file)
@@ -1255,8 +1255,8 @@ static int check_mc( int cpu_ref, int cpu_new )
             int *dstc = dsta+400;
             uint16_t *prop = (uint16_t*)buf1;
             uint16_t *intra = (uint16_t*)buf4;
-            uint16_t *inter = intra+100;
-            uint16_t *qscale = inter+100;
+            uint16_t *inter = intra+128;
+            uint16_t *qscale = inter+128;
             uint16_t *rnd = (uint16_t*)buf2;
             x264_emms();
             for( int j = 0; j < 100; j++ )