This is 1-2 cycles faster and avoids some code duplication, which decreases code size.
Also drop the MMX2 implementation in favor of SSE2 to simplify things.
#if HIGH_BIT_DEPTH
#define x264_predict_8x8c_v_mmx2 x264_predict_8x8c_v_mmx
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_c
+#define x264_predict_16x16_dc_mmx2 x264_predict_16x16_dc_c
#define x264_predict_8x8c_v_sse2 x264_predict_8x8c_v_sse
#define x264_predict_8x16c_v_sse2 x264_predict_8x16c_v_sse
#define x264_predict_16x16_v_sse2 x264_predict_16x16_v_sse
%endif
;-----------------------------------------------------------------------------
-; void predict_16x16_dc_core( pixel *src, int i_dc_left )
+; void predict_16x16_dc( pixel *src )
;-----------------------------------------------------------------------------
-%macro PRED16x16_DC_MMX 2
-%if HIGH_BIT_DEPTH
- mova m0, [r0 - FDEC_STRIDEB+ 0]
- paddw m0, [r0 - FDEC_STRIDEB+ 8]
- paddw m0, [r0 - FDEC_STRIDEB+16]
- paddw m0, [r0 - FDEC_STRIDEB+24]
- HADDW m0, m1
- paddw m0, %1
- psrlw m0, %2
- SPLATW m0, m0
- STORE16 m0, m0, m0, m0
-%else ; !HIGH_BIT_DEPTH
- pxor m0, m0
- pxor m1, m1
- psadbw m0, [r0 - FDEC_STRIDE]
- psadbw m1, [r0 - FDEC_STRIDE + 8]
- paddusw m0, m1
- paddusw m0, %1
- psrlw m0, %2 ; dc
- pshufw m0, m0, 0
- packuswb m0, m0 ; dc in bytes
- STORE16 m0, m0
-%endif
-%endmacro
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_core, 1,2
-%if ARCH_X86_64
- movd m6, r1d
- PRED16x16_DC_MMX m6, 5
+%if WIN64
+DECLARE_REG_TMP 6 ; Reduces code size due to fewer REX prefixes
%else
- PRED16x16_DC_MMX r1m, 5
+DECLARE_REG_TMP 3
%endif
- RET
-
-INIT_MMX mmx2
-cglobal predict_16x16_dc_top, 1,2
- PRED16x16_DC_MMX [pw_8], 4
- RET
-INIT_MMX mmx2
-%if HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,2
- movd m0, r1m
- SPLATW m0, m0
- STORE16 m0, m0, m0, m0
- RET
-%else ; !HIGH_BIT_DEPTH
-cglobal predict_16x16_dc_left_core, 1,1
- movd m0, r1m
- pshufw m0, m0, 0
- packuswb m0, m0
- STORE16 m0, m0
+INIT_XMM
+; Returns the sum of the 16 left-neighbour pixels as two partial sums, r1d (rows 0,2,..,14) and r2d (rows 1,3,..,15); callers add r1+r2. t0 is scratch.
+cglobal predict_16x16_dc_left_internal, 0,4
+ movzx r1d, pixel [r0-SIZEOF_PIXEL] ; row 0: the pixel immediately left of r0 seeds the first partial sum
+ movzx r2d, pixel [r0+FDEC_STRIDEB-SIZEOF_PIXEL] ; row 1 seeds the second partial sum (assumes FDEC_STRIDEB is the byte stride between rows)
+%assign i 2*FDEC_STRIDEB ; i = byte offset of the next row pair, starting at row 2
+%rep 7 ; 7 unrolled row pairs + the 2 seed rows = 16 pixels
+ movzx t0d, pixel [r0+i-SIZEOF_PIXEL] ; left pixel of the first row in this pair
+ add r1d, t0d
+ movzx t0d, pixel [r0+i+FDEC_STRIDEB-SIZEOF_PIXEL] ; left pixel of the second row in this pair
+ add r2d, t0d
+%assign i i+2*FDEC_STRIDEB ; advance to the next row pair
+%endrep
RET
-%endif
%macro PRED16x16_DC 2
%if HIGH_BIT_DEPTH
%endif
%endmacro
-%macro PREDICT_16x16_DC_CORE 0
-cglobal predict_16x16_dc_core, 2,2,4
- movd xm3, r1m
+%macro PREDICT_16x16_DC 0
+cglobal predict_16x16_dc, 1,3
+ call predict_16x16_dc_left_internal
+ lea r1d, [r1+r2+16]
+ movd xm3, r1d
PRED16x16_DC xm3, 5
RET
PRED16x16_DC [pw_8], 4
RET
-cglobal predict_16x16_dc_left_core, 1,2
- movd xm0, r1m
+cglobal predict_16x16_dc_left, 1,3
+ call predict_16x16_dc_left_internal
+ lea r1d, [r1+r2+8]
+ shr r1d, 4
+ movd xm0, r1d
SPLATW m0, xm0
%if HIGH_BIT_DEPTH && mmsize == 16
STORE16 m0, m0
%endmacro
INIT_XMM sse2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%if HIGH_BIT_DEPTH
INIT_YMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%else
INIT_XMM avx2
-PREDICT_16x16_DC_CORE
+PREDICT_16x16_DC
%endif
#include "predict.h"
#include "pixel.h"
-#define PREDICT_16x16_DC(name)\
-void x264_predict_16x16_dc_##name( pixel *src )\
-{\
- uint32_t dc = 16;\
- for( int i = 0; i < 16; i += 2 )\
- {\
- dc += src[-1 + i * FDEC_STRIDE];\
- dc += src[-1 + (i+1) * FDEC_STRIDE];\
- }\
- x264_predict_16x16_dc_core_##name( src, dc );\
-}
-
-PREDICT_16x16_DC( mmx2 )
-PREDICT_16x16_DC( sse2 )
-PREDICT_16x16_DC( avx2 )
-
-#define PREDICT_16x16_DC_LEFT(name)\
-static void x264_predict_16x16_dc_left_##name( pixel *src )\
-{\
- uint32_t dc = 8;\
- for( int i = 0; i < 16; i += 2 )\
- {\
- dc += src[-1 + i * FDEC_STRIDE];\
- dc += src[-1 + (i+1) * FDEC_STRIDE];\
- }\
- x264_predict_16x16_dc_left_core_##name( src, dc>>4 );\
-}
-
-PREDICT_16x16_DC_LEFT( mmx2 )
-PREDICT_16x16_DC_LEFT( sse2 )
-PREDICT_16x16_DC_LEFT( avx2 )
-
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
{
if( !(cpu&X264_CPU_MMX2) )
return;
- pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
- pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
- pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_16x16_h_avx2( uint16_t *src );
-void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
-void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_left_core_avx2( pixel *src, int i_dc_left );
-void x264_predict_16x16_dc_top_mmx2( pixel *src );
+void x264_predict_16x16_dc_avx2( pixel *src );
+void x264_predict_16x16_dc_left_sse2( pixel *src );
+void x264_predict_16x16_dc_left_avx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_avx2( pixel *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );