#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
- INIT7( ssd, _mmx );
+ INIT8( ssd, _mmx );
}
if( cpu&X264_CPU_MMX2 )
{
- INIT7( sad, _mmx2 );
- INIT7_NAME( sad_aligned, sad, _mmx2 );
+ INIT8( sad, _mmx2 );
+ INIT8_NAME( sad_aligned, sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
INIT7( satd, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT_ADS( _mmx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
}
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT_ADS( _ssse3 );
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
- INIT7( ssd, _ssse3 );
+ INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
INIT7( satd, _ssse3 );
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;
+
+#if HAVE_MMX
+ x264_predict_8x16c_init_mmx( cpu, pf );
+#endif
}
void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
INIT_MMX mmx2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_XMM sse2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_MMX mmx2
AVG_FUNC 4, movd, movd
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
AVGH 8, 8
AVGH 8, 4
INIT_MMX ssse3
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
DECL_SUF( x264_pixel_avg_8x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x8, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x4, ( pixel *, int, pixel *, int, pixel *, int, int ))
+DECL_SUF( x264_pixel_avg_4x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x8, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x4, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x2, ( pixel *, int, pixel *, int, pixel *, int, int ))
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmx2;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmx2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmx2;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_mmx2;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmx2;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmx2;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmx2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sse2;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sse2;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sse2;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sse2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_ssse3;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
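
For reference, the new PIXEL_4x16 entry follows the same contract as the other x264_pixel_avg_WxH functions declared above: a (possibly weighted) average of two sources written to dst. A minimal C sketch of that contract, not part of the patch, assuming x264's `pixel` typedef and `x264_clip_pixel()` from common.h (the name `pixel_avg_4x16_c` is illustrative):

static void pixel_avg_4x16_c( pixel *dst,  int i_dst,
                              pixel *src1, int i_src1,
                              pixel *src2, int i_src2, int weight )
{
    for( int y = 0; y < 16; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
        for( int x = 0; x < 4; x++ )
            dst[x] = ( weight == 32 )
                   ? ( src1[x] + src2[x] + 1 ) >> 1                                        /* plain rounded average */
                   : x264_clip_pixel( ( src1[x]*weight + src2[x]*(64-weight) + 32 ) >> 6 ); /* weighted bipred */
}
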
SSD 4, 4
SSD 8, 4
SSD 4, 8
+SSD 4, 16
INIT_XMM sse2slow
SSD 16, 16
SSD 8, 8
INIT_MMX ssse3
SSD 4, 4
SSD 4, 8
+SSD 4, 16
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
+cglobal pixel_var_8x16_mmx2, 2,3
+ FIX_STRIDES r1
+ VAR_START 0
+ VAR_2ROW r1, 8
+ VAR_END 8, 16
+
cglobal pixel_var_8x8_mmx2, 2,3
FIX_STRIDES r1
VAR_START 0
dec r2d
jg .loop
VAR_END 8, 8
+
+cglobal pixel_var_8x16, 2,4,8
+ VAR_START 1
+ mov r2d, 4
+ lea r3, [r1*3]
+.loop:
+ movh m0, [r0]
+ movh m3, [r0+r1]
+ movhps m0, [r0+r1*2]
+ movhps m3, [r0+r3]
+ DEINTB 1, 0, 4, 3, 7
+ lea r0, [r0+r1*4]
+ VAR_CORE
+ dec r2d
+ jg .loop
+ VAR_END 8, 16
%endmacro ; VAR
INIT_XMM sse2
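
Like the existing var kernels, the new pixel_var_8x16 functions accumulate a sum and a sum of squares over the block and return them packed into one 64-bit value (sum in the low 32 bits, sum of squares in the high 32 bits); the caller derives the variance from that pair. A rough C equivalent, illustrative only and assuming that packing convention:

static uint64_t pixel_var_8x16_c( pixel *pix, int i_stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 16; y++, pix += i_stride )
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sum + ((uint64_t)sqr << 32);
}
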
ret x264_pixel_##name##_8x16_##suffix args;\
ret x264_pixel_##name##_8x8_##suffix args;\
ret x264_pixel_##name##_8x4_##suffix args;\
+ ret x264_pixel_##name##_4x16_##suffix args;\
ret x264_pixel_##name##_4x8_##suffix args;\
ret x264_pixel_##name##_4x4_##suffix args;\
cextern pw_00ff
cextern pw_pixel_max
-%macro STORE8x8 2
+%macro STORE8x8 2-4
add r0, 4*FDEC_STRIDEB
mova [r0 + -4*FDEC_STRIDEB], %1
mova [r0 + -3*FDEC_STRIDEB], %1
mova [r0 + 3*FDEC_STRIDEB], %2
%endmacro
+%macro STORE8x16 4
+ add r0, 4*FDEC_STRIDEB
+ mova [r0 + -4*FDEC_STRIDEB], %1
+ mova [r0 + -3*FDEC_STRIDEB], %1
+ mova [r0 + -2*FDEC_STRIDEB], %1
+ mova [r0 + -1*FDEC_STRIDEB], %1
+ add r0, 4*FDEC_STRIDEB
+ mova [r0 + -4*FDEC_STRIDEB], %2
+ mova [r0 + -3*FDEC_STRIDEB], %2
+ mova [r0 + -2*FDEC_STRIDEB], %2
+ mova [r0 + -1*FDEC_STRIDEB], %2
+ add r0, 4*FDEC_STRIDEB
+ mova [r0 + -4*FDEC_STRIDEB], %3
+ mova [r0 + -3*FDEC_STRIDEB], %3
+ mova [r0 + -2*FDEC_STRIDEB], %3
+ mova [r0 + -1*FDEC_STRIDEB], %3
+ mova [r0 + 0*FDEC_STRIDEB], %4
+ mova [r0 + 1*FDEC_STRIDEB], %4
+ mova [r0 + 2*FDEC_STRIDEB], %4
+ mova [r0 + 3*FDEC_STRIDEB], %4
+%endmacro
+
%macro STORE16x16 2-4
%ifidn %0, 4
mov r1d, 8
%endif
+%macro PREDICT_8x16C_V 0
+cglobal predict_8x16c_v, 1,1
+ mova m0, [r0 - FDEC_STRIDEB]
+ STORE8x16 m0, m0, m0, m0
+ RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM sse2
+PREDICT_8x16C_V
+%else
+INIT_MMX mmx
+PREDICT_8x16C_V
+%endif
+
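predict_8x16c_v replicates the row of reconstructed pixels directly above the block into all 16 rows, which is what STORE8x16 does with the single register loaded from [r0 - FDEC_STRIDEB]. A C sketch of the same behaviour, not part of the patch, assuming x264's FDEC_STRIDE reconstruction layout:

static void predict_8x16c_v_c( pixel *src )
{
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 8; x++ )
            src[x + y*FDEC_STRIDE] = src[x - FDEC_STRIDE];
}
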
;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
+; void predict_8x16c_h( uint8_t *src )
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal predict_8x8c_h, 1,1
+%macro PREDICT_C_H 1
+cglobal predict_8x%1c_h, 1,1
add r0, FDEC_STRIDEB*4
%assign Y -4
-%rep 8
+%rep %1
movd m0, [r0+FDEC_STRIDEB*Y-SIZEOF_PIXEL*2]
SPLATW m0, m0, 1
mova [r0+FDEC_STRIDEB*Y], m0
%assign Y Y+1
%endrep
RET
+%endmacro
+
+PREDICT_C_H 8
+PREDICT_C_H 16
%else ; !HIGH_BIT_DEPTH
-%macro PREDICT_8x8C_H 0
-cglobal predict_8x8c_h, 1,1
-%if cpuflag(ssse3)
- mova m1, [pb_3]
-%endif
- add r0, FDEC_STRIDE*4
-%assign Y -4
-%rep 8
+%macro PREDICT_C_H_CORE 1
+%assign Y %1
+%rep 4
SPLATB_LOAD m0, r0+FDEC_STRIDE*Y-1, m1
mova [r0+FDEC_STRIDE*Y], m0
%assign Y Y+1
%endrep
+%endmacro
+
+%macro PREDICT_C_H 1
+cglobal predict_8x%1c_h, 1,1
+%if cpuflag(ssse3)
+ mova m1, [pb_3]
+%endif
+%if %1==16
+ add r0, FDEC_STRIDE*4
+ PREDICT_C_H_CORE -4
+ add r0, FDEC_STRIDE*4
+ PREDICT_C_H_CORE -4
+%endif
+ add r0, FDEC_STRIDE*4
+ PREDICT_C_H_CORE -4
+ PREDICT_C_H_CORE 0
RET
%endmacro
INIT_MMX mmx2
-PREDICT_8x8C_H
+PREDICT_C_H 8
+PREDICT_C_H 16
INIT_MMX ssse3
-PREDICT_8x8C_H
+PREDICT_C_H 8
+PREDICT_C_H 16
%endif
;-----------------------------------------------------------------------------
PREDICT_8x8C_DC
%endif
+%macro PREDICT_C_DC_TOP 1
%ifdef HIGH_BIT_DEPTH
INIT_XMM
-cglobal predict_8x8c_dc_top_sse2, 1,1
+cglobal predict_8x%1c_dc_top_sse2, 1,1
pxor m2, m2
mova m0, [r0 - FDEC_STRIDEB]
pshufd m1, m0, q2301
paddw m0, m1
psrlw m0, 1
pavgw m0, m2
- STORE8x8 m0, m0
+ STORE8x%1 m0, m0, m0, m0
RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
-cglobal predict_8x8c_dc_top_mmx2, 1,1
+cglobal predict_8x%1c_dc_top_mmx2, 1,1
movq mm0, [r0 - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
packuswb mm0, mm1 ; dc0,dc1 (b)
- STORE8x8 mm0, mm0
+ STORE8x%1 mm0, mm0, mm0, mm0
RET
%endif
+%endmacro
+
+PREDICT_C_DC_TOP 8
+PREDICT_C_DC_TOP 16
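
The DC_TOP predictors average the left and right halves of the row above the block separately (dc0 from the first four pixels, dc1 from the last four) and fill the matching halves of every row, 8 rows for 8x8c and 16 for 8x16c. A C sketch of the 8x16 case, illustrative only, assuming the usual FDEC_STRIDE layout:

static void predict_8x16c_dc_top_c( pixel *src )
{
    int dc0 = 0, dc1 = 0;
    for( int x = 0; x < 4; x++ )
    {
        dc0 += src[x     - FDEC_STRIDE];
        dc1 += src[x + 4 - FDEC_STRIDE];
    }
    dc0 = ( dc0 + 2 ) >> 2;  /* rounded average of the left half of the top row */
    dc1 = ( dc1 + 2 ) >> 2;  /* rounded average of the right half */
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 4; x++ )
        {
            src[x     + y*FDEC_STRIDE] = dc0;
            src[x + 4 + y*FDEC_STRIDE] = dc1;
        }
}
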
;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
#endif // HIGH_BIT_DEPTH
}
+void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
+{
+ if( !(cpu&X264_CPU_MMX) )
+ return;
+#if HIGH_BIT_DEPTH
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse2;
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
+#else
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
+ if( !(cpu&X264_CPU_MMX2) )
+ return;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3;
+#endif // HIGH_BIT_DEPTH
+}
+
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
{
if( !(cpu&X264_CPU_MMX2) )
#define X264_I386_PREDICT_H
void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init_mmx ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init_mmx ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init_mmx ( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
+void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
+void x264_predict_8x16c_v_mmx( uint8_t *src );
+void x264_predict_8x16c_v_sse2( uint16_t *src );
+void x264_predict_8x16c_h_mmx2( uint8_t *src );
+void x264_predict_8x16c_h_sse2( pixel *src );
+void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
SAD 8, 16
SAD 8, 8
SAD 8, 4
+SAD 4, 16
SAD 4, 8
SAD 4, 4
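
The new 4x16 SAD has the same meaning as the other block sizes: the sum of absolute differences between two 4x16 pixel blocks. A C sketch, illustrative only:

static int pixel_sad_4x16_c( pixel *pix1, int i_stride_pix1,
                             pixel *pix2, int i_stride_pix2 )
{
    int sum = 0;
    for( int y = 0; y < 16; y++, pix1 += i_stride_pix1, pix2 += i_stride_pix2 )
        for( int x = 0; x < 4; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d < 0 ? -d : d;
        }
    return sum;
}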