#include "common/common.h"
/* NASM functions */
+extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
NULL,
x264_mc_copy_w16_mmx
};
/*
 * Two-source averaging routines for the SSE2 code path, indexed by
 * i_width>>2 (see the x264_pixel_avg_wtab_##... lookups in MC_LUMA and
 * GET_REF).  Slot 0 is NULL.  Slots 1-3 reuse the mmxext routines
 * (w4, w8, w12); only slots 4 and 5 (w16, w20) point at SSE2 routines.
 */
+static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_mmxext,
+ x264_pixel_avg2_w8_mmxext,
+ x264_pixel_avg2_w12_mmxext,
+ x264_pixel_avg2_w16_sse2,
+ x264_pixel_avg2_w20_sse2,
+};
/*
 * Block copy routines for the SSE2 code path, indexed by i_width>>2 (see
 * the x264_mc_copy_wtab_##... lookup in MC_LUMA).  Slots 0 and 3 are NULL,
 * so there is no copy routine for those indices.  Slots 1 and 2 reuse the
 * mmx routines; only slot 4 (w16) points at an SSE2 routine.
 * NOTE(review): no prototype for x264_mc_copy_w16_sse2 appears among the
 * extern declarations visible here -- confirm it is declared elsewhere.
 */
+static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_mmx,
+ x264_mc_copy_w8_mmx,
+ NULL,
+ x264_mc_copy_w16_sse2,
+};
/*
 * Plane selectors indexed by qpel_idx = ((mvy&3)<<2) + (mvx&3).
 * hpel_ref0[qpel_idx] picks which of the four src[] planes supplies the
 * first source (src1); hpel_ref1[qpel_idx] picks the plane for the second
 * source (src2), which is only read when qpel_idx&5 is non-zero.
 */
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-void mc_luma_mmxext( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride,
- src2, i_height );
- }
- else
- {
- x264_mc_copy_wtab_mmx[i_width>>2](
- dst, i_dst_stride, src1, i_src_stride, i_height );
- }
/*
 * MC_LUMA(name,instr1,instr2) defines mc_luma_<name>(), which fills dst
 * from the planes in src[] for the motion vector (mvx,mvy).
 *
 *   name   - suffix of the generated function
 *   instr1 - suffix of the averaging table (x264_pixel_avg_wtab_<instr1>)
 *   instr2 - suffix of the copy table      (x264_mc_copy_wtab_<instr2>)
 *
 * The low two bits of mvx/mvy are treated as a fractional position and the
 * remaining bits as a whole-sample displacement:
 *   qpel_idx : (mvy&3) in bits 3:2, (mvx&3) in bits 1:0
 *   offset   : (mvy>>2) rows plus (mvx>>2) columns into the source plane
 *   src1     : plane hpel_ref0[qpel_idx] at offset, moved one row down
 *              when (mvy&3) == 3
 * If qpel_idx&5 is non-zero (mvx or mvy is odd), src2 is taken from plane
 * hpel_ref1[qpel_idx] at offset, moved one sample right when (mvx&3) == 3,
 * and the averaging routine for i_width>>2 is called with src1 and src2.
 * Otherwise the copy routine for i_width>>2 is called with src1 alone.
 *
 * The table lookups are not checked for NULL; presumably callers only pass
 * widths that map to a populated slot -- verify against callers.
 */
+#define MC_LUMA(name,instr1,instr2)\
+void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##instr1[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ }\
+ else\
+ {\
+ x264_mc_copy_wtab_##instr2[i_width>>2](\
+ dst, i_dst_stride, src1, i_src_stride, i_height );\
+ }\
}
-uint8_t *get_ref_mmxext( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
-{
- int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
-
- if( qpel_idx & 5 ) /* qpel interpolation needed */
- {
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
- x264_pixel_avg_wtab_mmxext[i_width>>2](
- dst, *i_dst_stride, src1, i_src_stride,
- src2, i_height );
- return dst;
- }
- else
- {
- *i_dst_stride = i_src_stride;
- return src1;
- }
/*
 * mc_luma_mmxext: mmxext averaging table with the mmx copy table.
 * mc_luma_sse2:   sse2 averaging table with the sse2 copy table.
 */
+MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+
/*
 * GET_REF(name) defines get_ref_<name>(), which returns a pointer to the
 * reference samples for the motion vector (mvx,mvy).  qpel_idx, offset,
 * src1 and src2 are derived exactly as in MC_LUMA.
 *
 * If qpel_idx&5 is non-zero, the averaging routine from
 * x264_pixel_avg_wtab_<name>[i_width>>2] writes into dst using the stride
 * currently stored in *i_dst_stride, and dst is returned.
 * Otherwise nothing is written to dst: *i_dst_stride is overwritten with
 * i_src_stride and src1 (a pointer into the source plane) is returned.
 */
+#define GET_REF(name)\
+uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
+ uint8_t *src[4], int i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height )\
+{\
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ if( qpel_idx & 5 ) /* qpel interpolation needed */\
+ {\
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ x264_pixel_avg_wtab_##name[i_width>>2](\
+ dst, *i_dst_stride, src1, i_src_stride,\
+ src2, i_height );\
+ return dst;\
+ }\
+ else\
+ {\
+ *i_dst_stride = i_src_stride;\
+ return src1;\
+ }\
}
/* Instantiate get_ref_mmxext() and get_ref_sse2(). */
+GET_REF(mmxext)
+GET_REF(sse2)
/*
 * Installs x86 SIMD routines into the function table pf according to the
 * capability bits in cpu.  Only the SSE2 stage is visible here:
 *   - without X264_CPU_SSE2 nothing below is installed;
 *   - with SSE2, memcpy_aligned is always switched to the SSE2 routine;
 *   - mc_luma, get_ref and the 16x16 / 16x8 avg entries are switched to
 *     SSE2 only when X264_CPU_3DNOW is not set.
 * NOTE(review): this looks like a diff excerpt; earlier setup between the
 * opening brace and the SSE2 check may be elided -- confirm against the
 * full file.
 */
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_SSE2) )
return;
-
+
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
+
+ // skip the SSE2 mc/avg routines below when the 3DNow! flag is set: it is used here to detect AMD processors, where they are slower
+ if( cpu&X264_CPU_3DNOW )
+ return;
+
+ pf->mc_luma = mc_luma_sse2;
+ pf->get_ref = get_ref_sse2;
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
}