From: Loren Merritt Date: Sun, 29 Jun 2008 06:00:03 +0000 (-0600) Subject: lowres_init asm X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=04dc25367d218c92ba85c4cae34cc8b36bab05a3;p=libx264 lowres_init asm rounding is changed for asm convenience. this makes the c version slower, but there's no way around that if all the implementations are to have the same results. --- diff --git a/common/frame.c b/common/frame.c index bdef1616..5d6de239 100644 --- a/common/frame.c +++ b/common/frame.c @@ -73,7 +73,7 @@ x264_frame_t *x264_frame_new( x264_t *h ) if( h->frames.b_have_lowres ) { frame->i_width_lowres = frame->i_width[0]/2; - frame->i_stride_lowres = frame->i_width_lowres + 2*PADH; + frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15; frame->i_lines_lowres = frame->i_lines[0]/2; for( i = 0; i < 4; i++ ) { diff --git a/common/mc.c b/common/mc.c index 513e47ae..2178f08e 100644 --- a/common/mc.c +++ b/common/mc.c @@ -283,6 +283,53 @@ static void memzero_aligned( void * dst, int n ) memset( dst, 0, n ); } +void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame ) +{ + uint8_t *src = frame->plane[0]; + int i_stride = frame->i_stride[0]; + int i_height = frame->i_lines[0]; + int i_width = frame->i_width[0]; + int x, y; + + // duplicate last row and column so that their interpolation doesn't have to be special-cased + for( y=0; y<i_height; y++ ) + src[i_width+y*i_stride] = src[i_width-1+y*i_stride]; + h->mc.memcpy_aligned( src+i_stride*i_height, src+i_stride*(i_height-1), i_width ); + h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3], + i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres ); + x264_frame_expand_border_lowres( frame ); + + for( y=0; y<16; y++ ) + for( x=0; x<16; x++ ) + frame->i_cost_est[y][x] = -1; +} + +static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + int src_stride, int dst_stride, int width, int height ) +{ + int x,y; + for( y=0; 
y<height; y++ ) + { + uint8_t *src1 = src0+src_stride; + uint8_t *src2 = src1+src_stride; + for( x=0; x<width; x++ ) + { +#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1) + dst0[x] = FILTER(src0[2*x ], src1[2*x ], src0[2*x+1], src1[2*x+1]); + dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]); + dstv[x] = FILTER(src1[2*x ], src2[2*x ], src1[2*x+1], src2[2*x+1]); + dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]); +#undef FILTER + } + src0 += src_stride*2; + dst0 += dst_stride; + dsth += dst_stride; + dstv += dst_stride; + dstc += dst_stride; + } +} + void x264_mc_init( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma; @@ -322,6 +369,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) pf->prefetch_ref = prefetch_ref_null; pf->memcpy_aligned = memcpy; pf->memzero_aligned = memzero_aligned; + pf->frame_init_lowres_core = frame_init_lowres_core; #ifdef HAVE_MMX x264_mc_init_mmx( cpu, pf ); @@ -389,42 +437,3 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) } } } - -void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame ) -{ - // FIXME: tapfilter? 
- const int i_stride = frame->i_stride[0]; - const int i_stride2 = frame->i_stride_lowres; - const int i_width2 = frame->i_width_lowres; - int x, y, i; - for( y = 0; y < frame->i_lines_lowres - 1; y++ ) - { - uint8_t *src0 = &frame->plane[0][2*y*i_stride]; - uint8_t *src1 = src0+i_stride; - uint8_t *src2 = src1+i_stride; - uint8_t *dst0 = &frame->lowres[0][y*i_stride2]; - uint8_t *dsth = &frame->lowres[1][y*i_stride2]; - uint8_t *dstv = &frame->lowres[2][y*i_stride2]; - uint8_t *dstc = &frame->lowres[3][y*i_stride2]; - for( x = 0; x < i_width2 - 1; x++ ) - { - dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2; - dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2; - dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2; - dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2; - } - dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2; - dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2; - dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1; - dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1; - } - for( i = 0; i < 4; i++ ) - memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 ); - - for( y = 0; y < 16; y++ ) - for( x = 0; x < 16; x++ ) - frame->i_cost_est[x][y] = -1; - - x264_frame_expand_border_lowres( frame ); -} - diff --git a/common/mc.h b/common/mc.h index 26f113f6..08331b81 100644 --- a/common/mc.h +++ b/common/mc.h @@ -69,6 +69,8 @@ typedef struct void *(*memcpy_aligned)( void *dst, const void *src, size_t n ); void (*memzero_aligned)( void *dst, int n ); + void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + int src_stride, int dst_stride, int width, int height ); } x264_mc_functions_t; void x264_mc_init( int cpu, x264_mc_functions_t *pf ); diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 606fa148..6f108cd4 100644 --- 
a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -86,13 +86,18 @@ SECTION .text packuswb m1, m4 %endmacro -%macro PALIGNR_SSE2 4 - %ifnidn %2, %4 - movdqa %4, %2 +%macro PALIGNR_MMX 4 + %ifnidn %4, %2 + mova %4, %2 %endif - pslldq %1, 16-%3 - psrldq %4, %3 - por %1, %4 + %if regsize == 8 + psllq %1, (8-%3)*8 + psrlq %4, %3*8 + %else + pslldq %1, 16-%3 + psrldq %4, %3 + %endif + por %1, %4 %endmacro %macro PALIGNR_SSSE3 4 @@ -306,7 +311,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,1 jl .loop REP_RET -%define PALIGNR PALIGNR_SSE2 +%define PALIGNR PALIGNR_MMX HPEL_V sse2 HPEL_C sse2 %define PALIGNR PALIGNR_SSSE3 @@ -468,3 +473,185 @@ INIT_MMX MEMZERO mmx INIT_XMM MEMZERO sse2 + + + +%macro FILT8x4 7 + mova %3, [r0+%7] + mova %4, [r0+r5+%7] + pavgb %3, %4 + pavgb %4, [r0+r5*2+%7] + PALIGNR %1, %3, 1, m6 + PALIGNR %2, %4, 1, m6 + pavgb %1, %3 + pavgb %2, %4 + mova %5, %1 + mova %6, %2 + pand %1, m7 + pand %2, m7 + psrlw %5, 8 + psrlw %6, 8 +%endmacro + +%macro FILT16x2 4 + mova m3, [r0+%4+regsize] + mova m2, [r0+%4] + pavgb m3, [r0+%4+r5+regsize] + pavgb m2, [r0+%4+r5] + PALIGNR %1, m3, 1, m6 + pavgb %1, m3 + PALIGNR m3, m2, 1, m6 + pavgb m3, m2 + mova m5, m3 + mova m4, %1 + pand m3, m7 + pand %1, m7 + psrlw m5, 8 + psrlw m4, 8 + packuswb m3, %1 + packuswb m5, m4 + mova [%2], m3 + mova [%3], m5 + mova %1, m2 +%endmacro + +%macro FILT8x2U 3 + mova m3, [r0+%3+8] + mova m2, [r0+%3] + pavgb m3, [r0+%3+r5+8] + pavgb m2, [r0+%3+r5] + mova m1, [r0+%3+9] + mova m0, [r0+%3+1] + pavgb m1, [r0+%3+r5+9] + pavgb m0, [r0+%3+r5+1] + pavgb m1, m3 + pavgb m0, m2 + mova m3, m1 + mova m2, m0 + pand m1, m7 + pand m0, m7 + psrlw m3, 8 + psrlw m2, 8 + packuswb m0, m1 + packuswb m2, m3 + mova [%1], m0 + mova [%2], m2 +%endmacro + +;----------------------------------------------------------------------------- +; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, +; int src_stride, int dst_stride, int width, int height ) 
+;----------------------------------------------------------------------------- +%macro FRAME_INIT_LOWRES 1 ; FIXME +cglobal x264_frame_init_lowres_core_%1, 6,7 + ; src += 2*(height-1)*stride + 2*width + mov r6d, r8m + dec r6d + imul r6d, r5d + add r6d, r7m + lea r0, [r0+r6*2] + ; dst += (height-1)*stride + width + mov r6d, r8m + dec r6d + imul r6d, r6m + add r6d, r7m + add r1, r6 + add r2, r6 + add r3, r6 + add r4, r6 + ; gap = stride - width + mov r6d, r6m + sub r6d, r7m + PUSH r6 + %define dst_gap [rsp+push_size] + mov r6d, r5d + sub r6d, r7m + shl r6d, 1 + PUSH r6 + %define src_gap [rsp] +%if regsize == 16 + ; adjust for the odd end case + mov r6d, r7m + and r6d, 8 + sub r1, r6 + sub r2, r6 + sub r3, r6 + sub r4, r6 + add dst_gap, r6d +%endif ; regsize + pcmpeqb m7, m7 + psrlw m7, 8 +.vloop: + mov r6d, r7m +%ifnidn %1, mmxext + mova m0, [r0] + mova m1, [r0+r5] + pavgb m0, m1 + pavgb m1, [r0+r5*2] +%endif +%if regsize == 16 + test r6d, 8 + jz .hloop + sub r0, 16 + FILT8x4 m0, m1, m2, m3, m4, m5, 0 + packuswb m0, m4 + packuswb m1, m5 + movq [r1], m0 + movhps [r2], m0 + movq [r3], m1 + movhps [r4], m1 + mova m0, m2 + mova m1, m3 + sub r6d, 8 +%endif ; regsize +.hloop: + sub r0, regsize*2 + sub r1, regsize + sub r2, regsize + sub r3, regsize + sub r4, regsize +%ifdef m8 + FILT8x4 m0, m1, m2, m3, m10, m11, regsize + mova m8, m0 + mova m9, m1 + FILT8x4 m2, m3, m0, m1, m4, m5, 0 + packuswb m2, m8 + packuswb m3, m9 + packuswb m4, m10 + packuswb m5, m11 + mova [r1], m2 + mova [r2], m4 + mova [r3], m3 + mova [r4], m5 +%elifidn %1, mmxext + FILT8x2U r1, r2, 0 + FILT8x2U r3, r4, r5 +%else + FILT16x2 m0, r1, r2, 0 + FILT16x2 m1, r3, r4, r5 +%endif + sub r6d, regsize + jg .hloop +.skip: + mov r6, dst_gap + sub r0, src_gap + sub r1, r6 + sub r2, r6 + sub r3, r6 + sub r4, r6 + dec dword r8m + jg .vloop + ADD rsp, 2*push_size + RET +%endmacro ; FRAME_INIT_LOWRES + +INIT_MMX +%define PALIGNR PALIGNR_MMX +FRAME_INIT_LOWRES mmxext +%ifndef ARCH_X86_64 +FRAME_INIT_LOWRES 
cache32_mmxext +%endif +INIT_XMM +FRAME_INIT_LOWRES sse2 +%define PALIGNR PALIGNR_SSSE3 +FRAME_INIT_LOWRES ssse3 diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 75ce420d..fb0815aa 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -65,6 +65,13 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); extern void x264_memzero_aligned_mmx( void * dst, int n ); extern void x264_memzero_aligned_sse2( void * dst, int n ); +#define LOWRES(cpu) \ +extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\ + int src_stride, int dst_stride, int width, int height ); +LOWRES(mmxext) +LOWRES(cache32_mmxext) +LOWRES(sse2) +LOWRES(ssse3) #define PIXEL_AVG_W(width,cpu)\ extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int ); @@ -269,6 +276,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy = x264_plane_copy_mmxext; pf->hpel_filter = x264_hpel_filter_mmxext; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext; pf->prefetch_fenc = x264_prefetch_fenc_mmxext; pf->prefetch_ref = x264_prefetch_ref_mmxext; @@ -278,11 +286,13 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { pf->mc_luma = mc_luma_cache32_mmxext; pf->get_ref = get_ref_cache32_mmxext; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext; } else if( cpu&X264_CPU_CACHELINE_64 ) { pf->mc_luma = mc_luma_cache64_mmxext; pf->get_ref = get_ref_cache64_mmxext; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext; } #endif @@ -308,6 +318,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2; } pf->hpel_filter = x264_hpel_filter_sse2; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2; pf->mc_chroma = x264_mc_chroma_sse2; if( 
cpu&X264_CPU_SSE2_IS_FAST ) @@ -325,5 +336,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->hpel_filter = x264_hpel_filter_ssse3; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->mc_chroma = x264_mc_chroma_ssse3; } diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index e4470936..e52d542f 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -135,6 +135,7 @@ DECLARE_REG 4, r8, r8d, r8w, r8b, r8d DECLARE_REG 5, r9, r9d, r9w, r9b, r9d DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8] %define r7m [rsp + stack_offset + 16] +%define r8m [rsp + stack_offset + 24] %macro LOAD_IF_USED 2 ; reg_id, number_of_args %if %1 < %2 @@ -167,6 +168,7 @@ DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20] DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24] DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28] %define r7m [esp + stack_offset + 32] +%define r8m [esp + stack_offset + 36] %define rsp esp %macro PUSH_IF_USED 1 ; reg_id @@ -332,6 +334,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define m5 xmm5 %define m6 xmm6 %define m7 xmm7 + %ifdef ARCH_X86_64 %define m8 xmm8 %define m9 xmm9 %define m10 xmm10 @@ -340,6 +343,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits %define m13 xmm13 %define m14 xmm14 %define m15 xmm15 + %endif %endmacro INIT_MMX @@ -389,6 +393,7 @@ INIT_MMX %xdefine %1_m5 m5 %xdefine %1_m6 m6 %xdefine %1_m7 m7 + %ifdef ARCH_X86_64 %xdefine %1_m8 m8 %xdefine %1_m9 m9 %xdefine %1_m10 m10 @@ -397,6 +402,7 @@ INIT_MMX %xdefine %1_m13 m13 %xdefine %1_m14 m14 %xdefine %1_m15 m15 + %endif %endmacro %macro LOAD_MM_PERMUTATION 1 @@ -408,6 +414,7 @@ INIT_MMX %xdefine m5 %1_m5 %xdefine m6 %1_m6 %xdefine m7 %1_m7 + %ifdef ARCH_X86_64 %xdefine m8 %1_m8 %xdefine m9 %1_m9 %xdefine m10 %1_m10 @@ -416,6 +423,7 @@ INIT_MMX %xdefine m13 %1_m13 %xdefine m14 %1_m14 %xdefine m15 %1_m15 + %endif %endmacro %macro call 1 diff --git a/tools/checkasm-a.asm 
b/tools/checkasm-a.asm index b909cd0a..e67ad824 100644 --- a/tools/checkasm-a.asm +++ b/tools/checkasm-a.asm @@ -29,7 +29,8 @@ SECTION .text cextern printf ; max number of args used by any x264 asm function. -%define max_args 8 +; (max_args % 4) must equal 3 for stack alignment +%define max_args 11 ; just random numbers to reduce the chance of incidental match %define n3 dword 0x6549315c @@ -42,16 +43,15 @@ cextern printf ; long x264_checkasm_call( long (*func)(), int *ok, ... ) ;----------------------------------------------------------------------------- cglobal x264_checkasm_call, 1,7 - sub esp, 12 mov r3, n3 mov r4, n4 mov r5, n5 mov r6, n6 %rep max_args - push dword [esp+36+max_args*4] + push dword [esp+24+max_args*4] %endrep call r0 - add esp, 12+max_args*4 + add esp, max_args*4 xor r3, n3 xor r4, n4 xor r5, n5 diff --git a/tools/checkasm.c b/tools/checkasm.c index 3a8f30b3..89798342 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -761,6 +761,38 @@ static int check_mc( int cpu_ref, int cpu_new ) report( "hpel filter :" ); } + if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) + { + uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 }; + uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 }; + set_func_name( "lowres_init" ); + for( w=40; w<=48; w+=8 ) + if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core ) + { + int stride = (w+8)&~15; + used_asm = 1; + call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 ); + call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 ); + for( i=0; i<16; i++) + { + for( j=0; j<4; j++) + if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) ) + { + ok = 0; + fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i ); + for( k=0; k