int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
+ int luma_plane_size;
if( !frame ) return NULL;
frame->i_stride[i] = i_stride >> !!i;
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
- CHECKED_MALLOC( frame->buffer[i],
- frame->i_stride[i] * (i_lines + 2*i_padv) >> !!i );
- frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
- ((frame->i_stride[i] * i_padv + PADH) >> !!i);
}
- frame->filtered[0] = frame->plane[0];
- for( i = 0; i < 3; i++ )
+ luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
+ for( i = 1; i < 3; i++ )
{
- CHECKED_MALLOC( frame->buffer[4+i],
- frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) );
- frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
- frame->i_stride[0] * i_padv + PADH;
+ CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
+ frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
+ /* all 4 luma planes allocated together, since the cacheline split code
+ * requires them to be in-phase wrt cacheline alignment. */
+ CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
+ for( i = 0; i < 4; i++ )
+ frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ frame->plane[0] = frame->filtered[0];
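+ /* Why the in-phase requirement holds (a sketch, assuming i_stride and
+  * i_lines + 2*i_padv are both multiples of 16, as in x264's frame setup,
+  * which is not shown in this hunk): luma_plane_size is then a multiple of
+  * 256, so consecutive filtered planes differ by a multiple of 64 bytes and
+  * share the same offset within a 64-byte cacheline, i.e.
+  *     for( i = 1; i < 4; i++ )
+  *         assert( ((frame->filtered[i] - frame->filtered[0]) & 63) == 0 );
+  */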
if( h->frames.b_have_lowres )
{
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
- CHECKED_MALLOC( frame->buffer[7],
+ CHECKED_MALLOC( frame->buffer[3],
2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
- frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH;
+ frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
frame->i_poc = -1;
void x264_frame_delete( x264_frame_t *frame )
{
int i, j;
- for( i = 0; i < 8; i++ )
+ for( i = 0; i < 4; i++ )
x264_free( frame->buffer[i] );
for( i = 0; i < 4; i++ )
x264_free( frame->buffer_lowres[i] );
pw_8: times 4 dw 8
pw_32: times 4 dw 32
pw_64: times 4 dw 64
+sw_64: dd 64  ; used by INIT_SHIFT below to form the complementary (64-n) shift count
SECTION .text
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+%macro PIXEL_AVG_SSE 1
+cglobal x264_pixel_avg2_w16_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w20_sse2, 6,7
+cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
sub r5d, 2
jg .height_loop
REP_RET
+%endmacro
+
+PIXEL_AVG_SSE sse2
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+PIXEL_AVG_SSE sse3
+%undef movdqu
+%endif
+
+; Cacheline split code for processors with high latencies for loads
+; split over cache lines. See sad-a.asm for a more detailed explanation.
+; This particular instance is complicated by the fact that src1 and src2
+; can have different alignments. For simplicity and code size, only the
+; MMX cacheline workaround is used. As a result, in the case of SSE2
+; pixel_avg, the cacheline check functions call the SSE2 version if there
+; is no cacheline split, and the MMX workaround if there is.
+
+%macro INIT_SHIFT 2
+ and eax, 7
+ shl eax, 3
+%ifdef PIC32
+ ; both versions work, but picgetgot is slower than gpr->mmx, which in turn is slower than mem->mmx
+ mov r2, 64
+ sub r2, eax
+ movd %2, eax
+ movd %1, r2
+%else
+ movd %1, [sw_64 GLOBAL]
+ movd %2, eax
+ psubw %1, %2
+%endif
+%endmacro
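+; (Illustrative: if src & 7 == 3, INIT_SHIFT leaves 24 in %2 and 40 in %1;
+; AVG_CACHELINE_LOOP below shifts the aligned low qword right by %2 bits and
+; the following qword left by %1 bits, then ORs them to recover the unaligned
+; 8 bytes.)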
+
+%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
+cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
+ mov eax, r2m
+ and eax, 0x1f|(%2>>1)
+ cmp eax, (32-%1)|(%2>>1)
+ jle x264_pixel_avg2_w%1_%3
+;w12 isn't needed because w16 is just as fast if there's no cacheline split
+%if %1 == 12
+ jmp x264_pixel_avg2_w16_cache_mmxext
+%else
+ jmp x264_pixel_avg2_w%1_cache_mmxext
+%endif
+%endmacro
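+
+; In C terms, the test above is roughly the following (a sketch for the
+; 64-byte-cacheline variants; the 32-byte variants fold the cacheline size
+; into the same mask/compare constants, and "spans_cacheline" is a
+; hypothetical name, not an x264 function):
+;
+;     static int spans_cacheline( intptr_t src, int width )
+;     {
+;         return (src & 63) + width > 64;
+;     }
+;
+; i.e. stay on the plain x264_pixel_avg2_w%1_%3 path only when a width-byte
+; read starting at src fits inside a single cacheline.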
+
+%macro AVG_CACHELINE_START 0
+ %assign stack_offset 0
+ INIT_SHIFT mm6, mm7 ; eax still holds src1's low bits, left there by AVG_CACHELINE_CHECK
+ mov eax, r4m
+ INIT_SHIFT mm4, mm5 ; and now src2's
+ PROLOGUE 6,6,0
+ and r2, ~7
+ and r4, ~7
+ sub r4, r2
+.height_loop:
+%endmacro
+%macro AVG_CACHELINE_LOOP 2
+ movq mm0, [r2+8+%1]
+ movq mm1, [r2+%1]
+ movq mm2, [r2+r4+8+%1]
+ movq mm3, [r2+r4+%1]
+ psllq mm0, mm6
+ psrlq mm1, mm7
+ psllq mm2, mm4
+ psrlq mm3, mm5
+ por mm0, mm1
+ por mm2, mm3
+ pavgb mm0, mm2
+ %2 [r0+%1], mm0
+%endmacro
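+
+; One iteration of the loop above, as a C sketch (hypothetical helper name;
+; assumes little-endian 64-bit loads, src already rounded down to an 8-byte
+; boundary by "and r2, ~7", and shift = 8*(src&7) as computed by INIT_SHIFT):
+;
+;     static uint64_t load8_shifted( const uint64_t *p, int shift )
+;     {
+;         uint64_t lo = p[0] >> shift;                   /* psrlq: drop the bytes before src */
+;         uint64_t hi = shift ? p[1] << (64-shift) : 0;  /* psllq; a shift of 64 yields 0 in MMX */
+;         return lo | hi;                                /* por: the 8 bytes starting at src */
+;     }
+;
+; pavgb then averages the two reconstructed qwords bytewise with rounding:
+; dst[i] = ( src1[i] + src2[i] + 1 ) >> 1.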
+x264_pixel_avg2_w8_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w16_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w20_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ AVG_CACHELINE_LOOP 16, movd
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+%ifndef ARCH_X86_64
+AVG_CACHELINE_CHECK 8, 32, mmxext
+AVG_CACHELINE_CHECK 12, 32, mmxext
+AVG_CACHELINE_CHECK 16, 32, mmxext
+AVG_CACHELINE_CHECK 20, 32, mmxext
+AVG_CACHELINE_CHECK 16, 64, mmxext
+AVG_CACHELINE_CHECK 20, 64, mmxext
+%endif
+
+AVG_CACHELINE_CHECK 8, 64, mmxext
+AVG_CACHELINE_CHECK 12, 64, mmxext
+AVG_CACHELINE_CHECK 16, 64, sse2
+AVG_CACHELINE_CHECK 20, 64, sse2
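+
+; The instantiations above emit entry points such as x264_pixel_avg2_w8_cache32_mmxext
+; and x264_pixel_avg2_w16_cache64_sse2; the C-side wtabs in mc-c.c (below) pick
+; them up when the detected cacheline size calls for the workaround.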
;=============================================================================
; pixel copy
%endmacro
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
+; but with SSE3 the overhead is zero, so there's no reason not to include it.
+%ifdef HAVE_SSE3
+COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
+%endif
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+#define PIXEL_AVG_W(width,cpu)\
+extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+/* This declares some functions that don't exist, but that isn't a problem. */
+#define PIXEL_AVG_WALL(cpu)\
+PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
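+/* For example, PIXEL_AVG_WALL(sse2) declares, among others,
+ *   extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+ * along with a few widths for which no sse2 asm actually exists. */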
+
+PIXEL_AVG_WALL(mmxext)
+PIXEL_AVG_WALL(cache32_mmxext)
+PIXEL_AVG_WALL(cache64_mmxext)
+PIXEL_AVG_WALL(cache64_sse2)
+PIXEL_AVG_WALL(sse2)
+PIXEL_AVG_WALL(sse3)
+
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
-static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
-{
- NULL,
- x264_pixel_avg2_w4_mmxext,
- x264_pixel_avg2_w8_mmxext,
- x264_pixel_avg2_w12_mmxext,
- x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w20_mmxext,
-};
-static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
-{
- NULL,
- x264_mc_copy_w4_mmx,
- x264_mc_copy_w8_mmx,
- NULL,
- x264_mc_copy_w16_mmx
-};
-static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
-{
- NULL,
- x264_pixel_avg2_w4_mmxext,
- x264_pixel_avg2_w8_mmxext,
- x264_pixel_avg2_w12_mmxext,
- x264_pixel_avg2_w16_sse2,
- x264_pixel_avg2_w20_sse2,
+#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
+static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
+{\
+ NULL,\
+ x264_pixel_avg2_w4_##name1,\
+ x264_pixel_avg2_w8_##name2,\
+ x264_pixel_avg2_w12_##name3,\
+ x264_pixel_avg2_w16_##name4,\
+ x264_pixel_avg2_w20_##name5,\
};
-static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
-{
- NULL,
- x264_mc_copy_w4_mmx,
- x264_mc_copy_w8_mmx,
- NULL,
- x264_mc_copy_w16_sse2,
+
+/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
+#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
+#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
+
+PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
+#ifdef ARCH_X86
+PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
+#endif
+PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
+PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
+PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+#ifdef HAVE_SSE3
+PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
+#endif
+
+#define MC_COPY_WTAB(instr, name1, name2, name3)\
+static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
+{\
+ NULL,\
+ x264_mc_copy_w4_##name1,\
+ x264_mc_copy_w8_##name2,\
+ NULL,\
+ x264_mc_copy_w16_##name3,\
};
+
+MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#ifdef HAVE_SSE3
+MC_COPY_WTAB(sse3,mmx,mmx,sse3)
+#endif
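+
+/* Both kinds of table are indexed by width/4 in the MC_LUMA/GET_REF wrappers
+ * (whose bodies are elided from this hunk), e.g. under that convention
+ *   x264_pixel_avg_wtab_sse2[16>>2] == x264_pixel_avg2_w16_sse2 and
+ *   x264_mc_copy_wtab_sse2[16>>2]   == x264_mc_copy_w16_sse2. */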
+
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
}
MC_LUMA(mmxext,mmxext,mmx)
+#ifdef ARCH_X86
+MC_LUMA(cache32_mmxext,cache32_mmxext,mmx)
+MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
+#endif
MC_LUMA(sse2,sse2,sse2)
+MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+#ifdef HAVE_SSE3
+MC_LUMA(cache64_sse3,cache64_sse3,sse3)
+#endif
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
}
GET_REF(mmxext)
+#ifdef ARCH_X86
+GET_REF(cache32_mmxext)
+GET_REF(cache64_mmxext)
+#endif
GET_REF(sse2)
+GET_REF(cache64_sse2)
+#ifdef HAVE_SSE3
+GET_REF(cache64_sse3)
+#endif
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
+#ifdef ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
+ if( cpu&X264_CPU_CACHELINE_32 )
+ {
+ pf->mc_luma = mc_luma_cache32_mmxext;
+ pf->get_ref = get_ref_cache32_mmxext;
+ }
+ else if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ pf->mc_luma = mc_luma_cache64_mmxext;
+ pf->get_ref = get_ref_cache64_mmxext;
+ }
+#endif
+
if( !(cpu&X264_CPU_SSE2) )
return;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ pf->mc_luma = mc_luma_cache64_sse2;
+ pf->get_ref = get_ref_cache64_sse2;
+#ifdef HAVE_SSE3
+ /* on Core2, lddqu is implemented as plain movdqu, so it gives no cacheline-split benefit there */
+ if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
+ {
+ pf->mc_luma = mc_luma_cache64_sse3;
+ pf->get_ref = get_ref_cache64_sse3;
+ }
+#endif
+ }
+
if( !(cpu&X264_CPU_SSSE3) )
return;