granicus.if.org Git - libx264/commitdiff
x86: AVX2 plane_copy_deinterleave
author: Henrik Gramner <henrik@gramner.com>
Tue, 17 Jan 2017 20:59:47 +0000 (21:59 +0100)
committer: Henrik Gramner <henrik@gramner.com>
Sat, 21 Jan 2017 13:10:37 +0000 (14:10 +0100)
50% faster than SSSE3 in 8-bit.
25% faster than AVX in high bit-depth.

Also drop the MMX versions of deinterleave functions in favor of SSE2.

common/x86/mc-a2.asm
common/x86/mc-c.c
encoder/encoder.c
tools/checkasm.c

index 07ecc7451543f01e577bb49f97367b65521aef06..f39645a16697e2f0eebe48f5816ab3a81fe89901 100644 (file)
@@ -37,24 +37,23 @@ filt_mul20: times 32 db 20
 filt_mul15: times 16 db 1, -5
 filt_mul51: times 16 db -5, 1
 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
-deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
 %if HIGH_BIT_DEPTH
-copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
 v210_mask: times 4 dq 0xc00ffc003ff003ff
 v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
 v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
            dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
-
+copy_swap_shuf:       SHUFFLE_MASK_W 1,0,3,2,5,4,7,6
+deinterleave_shuf:    SHUFFLE_MASK_W 0,2,4,6,1,3,5,7
 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
 %else
-copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14
 deinterleave_rgb_shuf: db  0, 3, 6, 9, 0, 3, 6, 9, 1, 4, 7,10, 2, 5, 8,11
                        db  0, 4, 8,12, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14
-
+copy_swap_shuf:        db  1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
+deinterleave_shuf:     db  0, 2, 4, 6, 8,10,12,14, 1, 3, 5, 7, 9,11,13,15
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
 %endif ; !HIGH_BIT_DEPTH
@@ -940,7 +939,11 @@ HPEL
 %macro PLANE_COPY_CORE 1 ; swap
 %if %1
 cglobal plane_copy_swap_core, 6,7
+%if mmsize == 32
+    vbroadcasti128 m4, [copy_swap_shuf]
+%else
     mova   m4, [copy_swap_shuf]
+%endif
 %else
 cglobal plane_copy_core, 6,7
 %endif
@@ -1042,24 +1045,23 @@ PLANE_COPY_CORE 1
 %endmacro
 
 %macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
-%if HIGH_BIT_DEPTH
-%assign n 0
-%rep 16/mmsize
-    mova     m0, [%3+(n+0)*mmsize]
-    mova     m1, [%3+(n+1)*mmsize]
+    mova     m0, [%3]
+%if mmsize == 32
+    pshufb   m0, %5
+    vpermq   m0, m0, q3120
+    mov%6  [%1], xm0
+    vextracti128 [%2], m0, 1
+%elif HIGH_BIT_DEPTH
+    mova     m1, [%3+mmsize]
     psrld    m2, m0, 16
     psrld    m3, m1, 16
     pand     m0, %5
     pand     m1, %5
     packssdw m0, m1
     packssdw m2, m3
-    mov%6    [%1+(n/2)*mmsize], m0
-    mov%6    [%2+(n/2)*mmsize], m2
-    %assign n (n+2)
-%endrep
+    mov%6  [%1], m0
+    mov%6  [%2], m2
 %else ; !HIGH_BIT_DEPTH
-%if mmsize==16
-    mova   m0, [%3]
 %if cpuflag(ssse3)
     pshufb m0, %5
 %else
@@ -1074,20 +1076,6 @@ PLANE_COPY_CORE 1
     movq   [%1], m0
     movhps [%2], m0
 %endif
-%else
-    mova   m0, [%3]
-    mova   m1, [%3+8]
-    mova   m2, m0
-    mova   m3, m1
-    pand   m0, %5
-    pand   m1, %5
-    psrlw  m2, 8
-    psrlw  m3, 8
-    packuswb m0, m1
-    packuswb m2, m3
-    mova   [%1], m0
-    mova   [%2], m2
-%endif ; mmsize == 16
 %endif ; HIGH_BIT_DEPTH
 %endmacro
 
@@ -1176,7 +1164,9 @@ cglobal store_interleave_chroma, 5,5
 %endmacro ; PLANE_INTERLEAVE
 
 %macro DEINTERLEAVE_START 0
-%if HIGH_BIT_DEPTH
+%if mmsize == 32
+    vbroadcasti128 m4, [deinterleave_shuf]
+%elif HIGH_BIT_DEPTH
     mova   m4, [pd_ffff]
 %elif cpuflag(ssse3)
     mova   m4, [deinterleave_shuf]
@@ -1191,31 +1181,44 @@ cglobal store_interleave_chroma, 5,5
 ;                               pixel *dstv, intptr_t i_dstv,
 ;                               pixel *src,  intptr_t i_src, int w, int h )
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal plane_copy_deinterleave, 6,9
+%define %%w r7
+%define %%h r8d
+    mov    r8d, r7m
+%else
 cglobal plane_copy_deinterleave, 6,7
+%define %%w r6m
+%define %%h dword r7m
+%endif
+%if HIGH_BIT_DEPTH
+%assign %%n 16
+%else
+%assign %%n mmsize/2
+%endif
     DEINTERLEAVE_START
     mov    r6d, r6m
     FIX_STRIDES r1, r3, r5, r6d
-%if HIGH_BIT_DEPTH
-    mov    r6m, r6d
-%endif
     add    r0,  r6
     add    r2,  r6
     lea    r4, [r4+r6*2]
-.loopy:
-    mov    r6d, r6m
     neg    r6
-.loopx:
-    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
-    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
-    add    r6, 16*SIZEOF_PIXEL
-    jl .loopx
+    mov   %%w, r6
+.loop:
+    DEINTERLEAVE r0+r6,     r2+r6,     r4+r6*2,       0, m4, u
+    DEINTERLEAVE r0+r6+%%n, r2+r6+%%n, r4+r6*2+%%n*2, 0, m4, u
+    add    r6, %%n*2
+    jl .loop
     add    r0, r1
     add    r2, r3
     add    r4, r5
-    dec dword r7m
-    jg .loopy
+    mov    r6, %%w
+    dec   %%h
+    jg .loop
     RET
+%endmacro ; PLANE_DEINTERLEAVE
 
+%macro LOAD_DEINTERLEAVE_CHROMA 0
 ;-----------------------------------------------------------------------------
 ; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
 ;-----------------------------------------------------------------------------
@@ -1245,7 +1248,7 @@ cglobal load_deinterleave_chroma_fdec, 4,4
     sub   r3d, 2
     jg .loop
     RET
-%endmacro ; PLANE_DEINTERLEAVE
+%endmacro ; LOAD_DEINTERLEAVE_CHROMA
 
 %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
 %if mmsize == 32
@@ -1414,33 +1417,31 @@ ALIGN 16
     RET
 %endmacro ; PLANE_DEINTERLEAVE_V210
 
-%if HIGH_BIT_DEPTH
 INIT_MMX mmx2
 PLANE_INTERLEAVE
-INIT_MMX mmx
-PLANE_DEINTERLEAVE
 INIT_XMM sse2
 PLANE_INTERLEAVE
 PLANE_DEINTERLEAVE
+LOAD_DEINTERLEAVE_CHROMA
+INIT_YMM avx2
+PLANE_DEINTERLEAVE
+
+%if HIGH_BIT_DEPTH
 INIT_XMM ssse3
 PLANE_DEINTERLEAVE_V210
 INIT_XMM avx
 PLANE_INTERLEAVE
 PLANE_DEINTERLEAVE
+LOAD_DEINTERLEAVE_CHROMA
 PLANE_DEINTERLEAVE_V210
 INIT_YMM avx2
 PLANE_DEINTERLEAVE_V210
 %else
-INIT_MMX mmx2
-PLANE_INTERLEAVE
-INIT_MMX mmx
-PLANE_DEINTERLEAVE
 INIT_XMM sse2
-PLANE_INTERLEAVE
-PLANE_DEINTERLEAVE
 PLANE_DEINTERLEAVE_RGB
 INIT_XMM ssse3
 PLANE_DEINTERLEAVE
+LOAD_DEINTERLEAVE_CHROMA
 PLANE_DEINTERLEAVE_RGB
 INIT_YMM avx2
 PLANE_DEINTERLEAVE_RGB
@@ -1947,7 +1948,7 @@ cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH,
     jg .hloop
 %else ; !HIGH_BIT_DEPTH
 %if cpuflag(avx2)
-    mova      m7, [deinterleave_shuf]
+    vbroadcasti128 m7, [deinterleave_shuf]
 %elif cpuflag(xop)
     mova      m6, [deinterleave_shuf32a]
     mova      m7, [deinterleave_shuf32b]
index 29c57dd76712c397d4a387daa69b56dc50e9b192..d8fbafcd593d733f86c95d94c23ba83c192623d6 100644 (file)
@@ -99,9 +99,6 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst,  intptr_t i_dst,
 void x264_plane_copy_interleave_core_avx( pixel *dst,  intptr_t i_dst,
                                           pixel *srcu, intptr_t i_srcu,
                                           pixel *srcv, intptr_t i_srcv, int w, int h );
-void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
-                                       pixel *dstv, intptr_t i_dstv,
-                                       pixel *src,  intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
                                         pixel *dstv, intptr_t i_dstv,
                                         pixel *src,  intptr_t i_src, int w, int h );
@@ -111,6 +108,9 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
 void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
                                        uint16_t *dstv, intptr_t i_dstv,
                                        uint16_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu,
+                                        pixel *dstv, intptr_t i_dstv,
+                                        pixel *src,  intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
                                              pixel *dstb, intptr_t i_dstb,
                                              pixel *dstc, intptr_t i_dstc,
@@ -135,11 +135,9 @@ void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
 void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
 void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
@@ -543,11 +541,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_MMX) )
         return;
 
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
-
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
-
     pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
@@ -705,6 +698,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
 
     if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -730,10 +726,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         if( cpu&X264_CPU_SSE2_IS_FAST )
         {
             pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
-            pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
-            pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
             pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
-            pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
             pf->mc_luma = mc_luma_sse2;
             pf->get_ref = get_ref_sse2;
             if( cpu&X264_CPU_CACHELINE_64 )
@@ -847,6 +840,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_AVX2) )
         return;
     pf->plane_copy_swap = x264_plane_copy_swap_avx2;
+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
     pf->get_ref = get_ref_avx2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
index fc7c17f12caed07986f1a7e00c9e826ae01f80c2..27db1bdf376ec0e8ae59341c54bdd5a4557c3822 100644 (file)
@@ -100,10 +100,10 @@ static void x264_frame_dump( x264_t *h )
         {
             int cw = h->param.i_width>>1;
             int ch = h->param.i_height>>CHROMA_V_SHIFT;
-            pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
+            pixel *planeu = x264_malloc( 2 * (cw*ch*sizeof(pixel) + 32) );
             if( planeu )
             {
-                pixel *planev = planeu + cw*ch + 16;
+                pixel *planev = planeu + cw*ch + 32/sizeof(pixel);
                 h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
                 fwrite( planeu, 1, cw*ch*sizeof(pixel), f );
                 fwrite( planev, 1, cw*ch*sizeof(pixel), f );
index 27496d879f7b3de290bdf347b93e112b23d32645..7427c338d73666448aac17a3a5a0a02fb18637a9 100644 (file)
@@ -1509,7 +1509,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             int h = plane_specs[i].h;
             intptr_t dst_stride = w;
             intptr_t src_stride = (2*w + 127) & ~63;
-            intptr_t offv = (dst_stride*h + 31) & ~15;
+            intptr_t offv = (dst_stride*h + 63) & ~31;
             memset( pbuf3, 0, 0x1000 );
             memset( pbuf4, 0, 0x1000 );
             call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );