; filt_mul51: the byte pair (-5, 1) repeated 8 times (16 bytes).
filt_mul51: times 8 db -5, 1
; hpel_shuf: byte indices 0-7 interleaved with byte indices 8-15.
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
; deinterleave_shuf: even byte indices first, then odd byte indices.
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
; deinterleave_shuf32a / deinterleave_shuf32b: "even" (a) and "odd" (b)
; selector tables; the XOP code paths load them into m6 / m7 and pass them
; as the last operand of vpperm.
+%if HIGH_BIT_DEPTH
; SHUFFLE_MASK_W is defined outside this excerpt; presumably it expands each
; word index into the matching pair of byte indices -- TODO confirm.
+deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
+deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
+%else
; Byte indices run over 0-31, i.e. they address a 32-byte (two-register) source.
+deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
+deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
+%endif
; pd_16: four dwords, each 16.
pd_16: times 4 dd 16
; pd_0f: four dwords, each 0xffff (low 16 bits set).
pd_0f: times 4 dd 0xffff
; Tail of a macro whose header lies outside this excerpt (it uses parameters
; up to %7; presumably FILT8x4 -- TODO confirm).
pavgb %4, [r0+r5*2+%7]
PALIGNR %1, %3, 1, m6
PALIGNR %2, %4, 1, m6
+%if cpuflag(xop)
; XOP: only the averages are computed here. %5/%6 are not written and no
; mask is applied to %1/%2 in this branch.
+ pavgb %1, %3
+ pavgb %2, %4
+%else
pavgb %1, %3
pavgb %2, %4
; %5/%6 = each word of %1/%2 shifted right by 8 (the high byte of every word);
; %1/%2 are then ANDed with m7 (expected to hold a mask set up by the caller).
psrlw %5, %1, 8
psrlw %6, %2, 8
pand %1, m7
pand %2, m7
+%endif
%endmacro
; FILT16x2 takes 4 parameters. In the lines visible here %1 is a register
; operand and %2 / %3 are store addresses; %4 is not used in the visible part.
%macro FILT16x2 4
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
+%if cpuflag(xop)
; XOP: two byte permutes over the (m3, %1) register pair, with m7 and m6 as
; selectors, take the place of the shift/mask/pack sequence in the other
; branch. m5 is produced first because the second vpperm overwrites m3.
+ vpperm m5, m3, %1, m7
+ vpperm m3, m3, %1, m6
+%else
; m5/m4 = high byte of every word of m3/%1; m3/%1 are masked with m7.
; The two packs then gather the masked words into m3 and the shifted words
; into m5, as unsigned-saturated bytes.
psrlw m5, m3, 8
psrlw m4, %1, 8
pand m3, m7
pand %1, m7
packuswb m3, %1
packuswb m5, m4
+%endif
; Store both results, then copy m2 into %1.
mova [%2], m3
mova [%3], m5
mova %1, m2
; Word-sized counterpart of the byte filter (macro header is outside this
; excerpt): averages are done with pavgw and the alignment shift is 2 bytes.
pavgw %1, m3
PALIGNR m3, m2, 2, m6
pavgw m3, m2
+%if cpuflag(xop)
; XOP: two permutes over the (m3, %1) pair with m7 / m6 as selectors replace
; the shift/mask/pack sequence. m5 is written first because the second
; vpperm overwrites m3.
+ vpperm m5, m3, %1, m7
+ vpperm m3, m3, %1, m6
+%else
; m5/m4 = high word of every dword of m3/%1; m3/%1 are masked with m7.
; packssdw then narrows the dwords back to words.
psrld m5, m3, 16
psrld m4, %1, 16
pand m3, m7
pand %1, m7
packssdw m3, %1
packssdw m5, m4
+%endif
; Store both results, then copy m2 into %1.
mova [%2], m3
mova [%3], m5
mova %1, m2
; r6 is pushed and the resulting top-of-stack slot is referred to as src_gap.
PUSH r6
%define src_gap [rsp]
%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
; XOP: m6 / m7 hold the even / odd vpperm selector tables instead of a mask.
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
; m7 = 0x0000ffff in every dword (all ones, then each dword shifted right by 16).
pcmpeqw m7, m7
psrld m7, 16
+%endif
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
sub r4, r6
add dst_gap, r6d
%endif ; mmsize
+%if cpuflag(xop)
; XOP: same selector setup as the high-bit-depth path; the tables themselves
; differ per bit depth (see their definitions).
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
; m7 = 0x00ff in every word (all ones, then each word shifted right by 8).
pcmpeqb m7, m7
psrlw m7, 8
+%endif
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
jz .hloop
sub r0, 16
FILT8x4 m0, m1, m2, m3, m4, m5, 0
+%if cpuflag(xop)
; XOP: m4 keeps a copy of m0 so that both permutes read the original
; (m0, m1) pair. m0 is built with the m6 selector and m1 with the m7 selector.
; Note the store pattern differs from the pack path: the low halves go to
; [r1] / [r2] and the high halves to [r3] / [r4].
+ mova m4, m0
+ vpperm m0, m4, m1, m6
+ vpperm m1, m4, m1, m7
+ movq [r1], m0
+ movq [r2], m1
+ movhps [r3], m0
+ movhps [r4], m1
+%else
; Pack path: m0 is packed with m4 and m1 with m5; the low/high halves of m0 go
; to [r1] / [r2] and the low/high halves of m1 go to [r3] / [r4].
packuswb m0, m4
packuswb m1, m5
movq [r1], m0
movhps [r2], m0
movq [r3], m1
movhps [r4], m1
+%endif
mova m0, m2
mova m1, m3
sub r6d, 8
; m8 / m9 save m0 / m1 before the next FILT8x4 call, which is handed m0 / m1
; as its third and fourth arguments.
mova m8, m0
mova m9, m1
FILT8x4 m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
; XOP: permute the (m2, m8) and (m3, m9) pairs. The m7-selected results go to
; m4 / m5 first, because the m6-selected results overwrite m2 / m3.
; m10 / m11 are not read in this branch.
+ vpperm m4, m2, m8, m7
+ vpperm m2, m2, m8, m6
+ vpperm m5, m3, m9, m7
+ vpperm m3, m3, m9, m6
+%else
; Pack path: combine the current results with the saved m8 / m9 and with
; m10 / m11 (written outside this excerpt).
packuswb m2, m8
packuswb m3, m9
packuswb m4, m10
packuswb m5, m11
+%endif
; Both branches leave their outputs in m2..m5 for the stores that follow.
mova [r1], m2
mova [r2], m4
mova [r3], m3
; The FRAME_INIT_LOWRES macro is expanded once after each INIT_XMM setting.
; The avx and xop expansions are presumably what provide the
; x264_frame_init_lowres_core_avx / _xop symbols used by the C init code
; -- TODO confirm against the macro definition.
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES
+INIT_XMM avx
+FRAME_INIT_LOWRES
+INIT_XMM xop
+FRAME_INIT_LOWRES
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
// One LOWRES(...) line per CPU variant. The LOWRES macro is defined outside
// this excerpt; presumably it declares x264_frame_init_lowres_core_<cpu>
// -- TODO confirm.
LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(ssse3)
+LOWRES(avx)
+LOWRES(xop)
// PIXEL_AVG_W(width,cpu) expands to the prototype of
// x264_pixel_avg2_w<width>_<cpu>, a void function taking three
// (pixel *, int) argument pairs.
#define PIXEL_AVG_W(width,cpu)\
void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
// Nothing below this point is installed unless the AVX flag is set.
if( !(cpu&X264_CPU_AVX) )
return;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
+
// The XOP check sits after the AVX early return, so the XOP routine is only
// reached when AVX is also reported; it replaces the AVX pointer set above.
+ if( cpu&X264_CPU_XOP )
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
// Nothing below this point is installed unless the AVX flag is set.
if( !(cpu&X264_CPU_AVX) )
return;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
pf->integral_init8h = x264_integral_init8h_avx;
pf->hpel_filter = x264_hpel_filter_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
+
// As in the high-bit-depth branch: XOP, when reported, replaces the AVX
// frame_init_lowres_core pointer set above.
+ if( cpu&X264_CPU_XOP )
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )