Rounding is changed for asm convenience. This makes the C version slower, but there's no way around that if all the implementations are to produce the same results.
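For illustration only (not part of the patch), here is a minimal standalone comparison of the old and new roundings; filt_old/filt_new are hypothetical names used just for this sketch:

    #include <stdio.h>

    /* old C rounding: sum all four samples, round once at the end */
    static int filt_old( int a, int b, int c, int d ) { return (a + b + c + d + 2) >> 2; }
    /* new rounding: two cascaded pavgb-style averages, matching the SIMD code */
    static int filt_new( int a, int b, int c, int d ) { return (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1; }

    int main( void )
    {
        int a, b, c, d, diff = 0;
        /* a small exhaustive sweep shows the two disagree, e.g. (1,0,0,0): old -> 0, new -> 1 */
        for( a = 0; a < 16; a++ )
            for( b = 0; b < 16; b++ )
                for( c = 0; c < 16; c++ )
                    for( d = 0; d < 16; d++ )
                        diff += filt_old( a, b, c, d ) != filt_new( a, b, c, d );
        printf( "%d of %d inputs differ\n", diff, 16*16*16*16 );
        return 0;
    }
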
if( h->frames.b_have_lowres )
{
frame->i_width_lowres = frame->i_width[0]/2;
- frame->i_stride_lowres = frame->i_width_lowres + 2*PADH;
+ frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
frame->i_lines_lowres = frame->i_lines[0]/2;
for( i = 0; i < 4; i++ )
{
memset( dst, 0, n );
}
+void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
+{
+ uint8_t *src = frame->plane[0];
+ int i_stride = frame->i_stride[0];
+ int i_height = frame->i_lines[0];
+ int i_width = frame->i_width[0];
+ int x, y;
+
+ // duplicate last row and column so that their interpolation doesn't have to be special-cased
+ for( y=0; y<i_height; y++ )
+ src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
+ h->mc.memcpy_aligned( src+i_stride*i_height, src+i_stride*(i_height-1), i_width );
+ h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
+ i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
+ x264_frame_expand_border_lowres( frame );
+
+ for( y=0; y<16; y++ )
+ for( x=0; x<16; x++ )
+ frame->i_cost_est[y][x] = -1;
+}
+
+static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ int src_stride, int dst_stride, int width, int height )
+{
+ int x,y;
+ for( y=0; y<height; y++ )
+ {
+ uint8_t *src1 = src0+src_stride;
+ uint8_t *src2 = src1+src_stride;
+ for( x=0; x<width; x++ )
+ {
+ // slower than naive bilinear, but matches asm
+#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
+ dst0[x] = FILTER(src0[2*x ], src1[2*x ], src0[2*x+1], src1[2*x+1]);
+ dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
+ dstv[x] = FILTER(src1[2*x ], src2[2*x ], src1[2*x+1], src2[2*x+1]);
+ dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
+#undef FILTER
+ }
+ src0 += src_stride*2;
+ dst0 += dst_stride;
+ dsth += dst_stride;
+ dstv += dst_stride;
+ dstc += dst_stride;
+ }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
pf->prefetch_ref = prefetch_ref_null;
pf->memcpy_aligned = memcpy;
pf->memzero_aligned = memzero_aligned;
+ pf->frame_init_lowres_core = frame_init_lowres_core;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
}
}
}
-
-void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
-{
- // FIXME: tapfilter?
- const int i_stride = frame->i_stride[0];
- const int i_stride2 = frame->i_stride_lowres;
- const int i_width2 = frame->i_width_lowres;
- int x, y, i;
- for( y = 0; y < frame->i_lines_lowres - 1; y++ )
- {
- uint8_t *src0 = &frame->plane[0][2*y*i_stride];
- uint8_t *src1 = src0+i_stride;
- uint8_t *src2 = src1+i_stride;
- uint8_t *dst0 = &frame->lowres[0][y*i_stride2];
- uint8_t *dsth = &frame->lowres[1][y*i_stride2];
- uint8_t *dstv = &frame->lowres[2][y*i_stride2];
- uint8_t *dstc = &frame->lowres[3][y*i_stride2];
- for( x = 0; x < i_width2 - 1; x++ )
- {
- dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;
- dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2;
- dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;
- dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2;
- }
- dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;
- dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;
- dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1;
- dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1;
- }
- for( i = 0; i < 4; i++ )
- memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 );
-
- for( y = 0; y < 16; y++ )
- for( x = 0; x < 16; x++ )
- frame->i_cost_est[x][y] = -1;
-
- x264_frame_expand_border_lowres( frame );
-}
-
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
+ void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ int src_stride, int dst_stride, int width, int height );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
packuswb m1, m4
%endmacro
-%macro PALIGNR_SSE2 4
- %ifnidn %2, %4
- movdqa %4, %2
+%macro PALIGNR_MMX 4
+ %ifnidn %4, %2
+ mova %4, %2
%endif
- pslldq %1, 16-%3
- psrldq %4, %3
- por %1, %4
+ %if regsize == 8
+ psllq %1, (8-%3)*8
+ psrlq %4, %3*8
+ %else
+ pslldq %1, 16-%3
+ psrldq %4, %3
+ %endif
+ por %1, %4
%endmacro
%macro PALIGNR_SSSE3 4
jl .loop
REP_RET
-%define PALIGNR PALIGNR_SSE2
+%define PALIGNR PALIGNR_MMX
HPEL_V sse2
HPEL_C sse2
%define PALIGNR PALIGNR_SSSE3
MEMZERO mmx
INIT_XMM
MEMZERO sse2
+
+
+
+%macro FILT8x4 7
+ mova %3, [r0+%7]
+ mova %4, [r0+r5+%7]
+ pavgb %3, %4
+ pavgb %4, [r0+r5*2+%7]
+ PALIGNR %1, %3, 1, m6
+ PALIGNR %2, %4, 1, m6
+ pavgb %1, %3
+ pavgb %2, %4
+ mova %5, %1
+ mova %6, %2
+ pand %1, m7
+ pand %2, m7
+ psrlw %5, 8
+ psrlw %6, 8
+%endmacro
+
+%macro FILT16x2 4
+ mova m3, [r0+%4+regsize]
+ mova m2, [r0+%4]
+ pavgb m3, [r0+%4+r5+regsize]
+ pavgb m2, [r0+%4+r5]
+ PALIGNR %1, m3, 1, m6
+ pavgb %1, m3
+ PALIGNR m3, m2, 1, m6
+ pavgb m3, m2
+ mova m5, m3
+ mova m4, %1
+ pand m3, m7
+ pand %1, m7
+ psrlw m5, 8
+ psrlw m4, 8
+ packuswb m3, %1
+ packuswb m5, m4
+ mova [%2], m3
+ mova [%3], m5
+ mova %1, m2
+%endmacro
+
+%macro FILT8x2U 3
+ mova m3, [r0+%3+8]
+ mova m2, [r0+%3]
+ pavgb m3, [r0+%3+r5+8]
+ pavgb m2, [r0+%3+r5]
+ mova m1, [r0+%3+9]
+ mova m0, [r0+%3+1]
+ pavgb m1, [r0+%3+r5+9]
+ pavgb m0, [r0+%3+r5+1]
+ pavgb m1, m3
+ pavgb m0, m2
+ mova m3, m1
+ mova m2, m0
+ pand m1, m7
+ pand m0, m7
+ psrlw m3, 8
+ psrlw m2, 8
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [%1], m0
+ mova [%2], m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; int src_stride, int dst_stride, int width, int height )
+;-----------------------------------------------------------------------------
+%macro FRAME_INIT_LOWRES 1 ; FIXME
+cglobal x264_frame_init_lowres_core_%1, 6,7
+ ; src += 2*(height-1)*stride + 2*width
+ mov r6d, r8m
+ dec r6d
+ imul r6d, r5d
+ add r6d, r7m
+ lea r0, [r0+r6*2]
+ ; dst += (height-1)*stride + width
+ mov r6d, r8m
+ dec r6d
+ imul r6d, r6m
+ add r6d, r7m
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ ; gap = stride - width
+ mov r6d, r6m
+ sub r6d, r7m
+ PUSH r6
+ %define dst_gap [rsp+push_size]
+ mov r6d, r5d
+ sub r6d, r7m
+ shl r6d, 1
+ PUSH r6
+ %define src_gap [rsp]
+%if regsize == 16
+ ; adjust for the odd end case
+ mov r6d, r7m
+ and r6d, 8
+ sub r1, r6
+ sub r2, r6
+ sub r3, r6
+ sub r4, r6
+ add dst_gap, r6d
+%endif ; regsize
+ pcmpeqb m7, m7
+ psrlw m7, 8
+.vloop:
+ mov r6d, r7m
+%ifnidn %1, mmxext
+ mova m0, [r0]
+ mova m1, [r0+r5]
+ pavgb m0, m1
+ pavgb m1, [r0+r5*2]
+%endif
+%if regsize == 16
+ test r6d, 8
+ jz .hloop
+ sub r0, 16
+ FILT8x4 m0, m1, m2, m3, m4, m5, 0
+ packuswb m0, m4
+ packuswb m1, m5
+ movq [r1], m0
+ movhps [r2], m0
+ movq [r3], m1
+ movhps [r4], m1
+ mova m0, m2
+ mova m1, m3
+ sub r6d, 8
+%endif ; regsize
+.hloop:
+ sub r0, regsize*2
+ sub r1, regsize
+ sub r2, regsize
+ sub r3, regsize
+ sub r4, regsize
+%ifdef m8
+ FILT8x4 m0, m1, m2, m3, m10, m11, regsize
+ mova m8, m0
+ mova m9, m1
+ FILT8x4 m2, m3, m0, m1, m4, m5, 0
+ packuswb m2, m8
+ packuswb m3, m9
+ packuswb m4, m10
+ packuswb m5, m11
+ mova [r1], m2
+ mova [r2], m4
+ mova [r3], m3
+ mova [r4], m5
+%elifidn %1, mmxext
+ FILT8x2U r1, r2, 0
+ FILT8x2U r3, r4, r5
+%else
+ FILT16x2 m0, r1, r2, 0
+ FILT16x2 m1, r3, r4, r5
+%endif
+ sub r6d, regsize
+ jg .hloop
+.skip:
+ mov r6, dst_gap
+ sub r0, src_gap
+ sub r1, r6
+ sub r2, r6
+ sub r3, r6
+ sub r4, r6
+ dec dword r8m
+ jg .vloop
+ ADD rsp, 2*push_size
+ RET
+%endmacro ; FRAME_INIT_LOWRES
+
+INIT_MMX
+%define PALIGNR PALIGNR_MMX
+FRAME_INIT_LOWRES mmxext
+%ifndef ARCH_X86_64
+FRAME_INIT_LOWRES cache32_mmxext
+%endif
+INIT_XMM
+FRAME_INIT_LOWRES sse2
+%define PALIGNR PALIGNR_SSSE3
+FRAME_INIT_LOWRES ssse3
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n );
extern void x264_memzero_aligned_sse2( void * dst, int n );
+#define LOWRES(cpu) \
+extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
+ int src_stride, int dst_stride, int width, int height );
+LOWRES(mmxext)
+LOWRES(cache32_mmxext)
+LOWRES(sse2)
+LOWRES(ssse3)
#define PIXEL_AVG_W(width,cpu)\
extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
{
pf->mc_luma = mc_luma_cache32_mmxext;
pf->get_ref = get_ref_cache32_mmxext;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
}
else if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_mmxext;
pf->get_ref = get_ref_cache64_mmxext;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
}
#endif
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
}
pf->hpel_filter = x264_hpel_filter_sse2;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
return;
pf->hpel_filter = x264_hpel_filter_ssse3;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
}
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
+%define r8m [rsp + stack_offset + 24]
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
%if %1 < %2
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
+%define r8m [esp + stack_offset + 36]
%define rsp esp
%macro PUSH_IF_USED 1 ; reg_id
%define m5 xmm5
%define m6 xmm6
%define m7 xmm7
+ %ifdef ARCH_X86_64
%define m8 xmm8
%define m9 xmm9
%define m10 xmm10
%define m13 xmm13
%define m14 xmm14
%define m15 xmm15
+ %endif
%endmacro
INIT_MMX
%xdefine %1_m5 m5
%xdefine %1_m6 m6
%xdefine %1_m7 m7
+ %ifdef ARCH_X86_64
%xdefine %1_m8 m8
%xdefine %1_m9 m9
%xdefine %1_m10 m10
%xdefine %1_m13 m13
%xdefine %1_m14 m14
%xdefine %1_m15 m15
+ %endif
%endmacro
%macro LOAD_MM_PERMUTATION 1
%xdefine m5 %1_m5
%xdefine m6 %1_m6
%xdefine m7 %1_m7
+ %ifdef ARCH_X86_64
%xdefine m8 %1_m8
%xdefine m9 %1_m9
%xdefine m10 %1_m10
%xdefine m13 %1_m13
%xdefine m14 %1_m14
%xdefine m15 %1_m15
+ %endif
%endmacro
%macro call 1
cextern printf
; max number of args used by any x264 asm function.
-%define max_args 8
+; (max_args % 4) must equal 3 for stack alignment
+%define max_args 11
; just random numbers to reduce the chance of incidental match
%define n3 dword 0x6549315c
; long x264_checkasm_call( long (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
cglobal x264_checkasm_call, 1,7
- sub esp, 12
mov r3, n3
mov r4, n4
mov r5, n5
mov r6, n6
%rep max_args
- push dword [esp+36+max_args*4]
+ push dword [esp+24+max_args*4]
%endrep
call r0
- add esp, 12+max_args*4
+ add esp, max_args*4
xor r3, n3
xor r4, n4
xor r5, n5
report( "hpel filter :" );
}
+ if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+ {
+ uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
+ uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
+ set_func_name( "lowres_init" );
+ for( w=40; w<=48; w+=8 )
+ if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
+ {
+ int stride = (w+8)&~15;
+ used_asm = 1;
+ call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+ call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+ for( i=0; i<16; i++ )
+ {
+ for( j=0; j<4; j++ )
+ if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+ {
+ ok = 0;
+ fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
+ for( k=0; k<w; k++ )
+ printf( "%d ", dstc[j][k+i*stride] );
+ printf("\n");
+ for( k=0; k<w; k++ )
+ printf( "%d ", dsta[j][k+i*stride] );
+ printf("\n");
+ break;
+ }
+ }
+ }
+ report( "lowres init :" );
+ }
+
return ret;
}