From b5bc5d69c580429ff716bafcd43655e855c31b02 Mon Sep 17 00:00:00 2001 From: Henrik Gramner Date: Sat, 30 Mar 2019 17:47:25 +0100 Subject: [PATCH] x86: Perform stack realignment in C instead of assembly Simplifies a lot of code and avoids having to export public asm functions. Note that the force_align_arg_pointer function attribute is broken in clang versions prior to 6.0.1 which may result in crashes, so make sure to either use a newer clang version or a different compiler. --- common/base.c | 60 ++++++++------------------------------------ common/cpu.h | 17 ------------- common/osdep.h | 6 +++++ common/threadpool.c | 7 +----- common/x86/cpu-a.asm | 48 +---------------------------------- encoder/api.c | 49 ++++++++++++++++-------------------- encoder/lookahead.c | 6 +---- tools/checkasm.c | 7 +----- x264.c | 7 +----- 9 files changed, 43 insertions(+), 164 deletions(-) diff --git a/common/base.c b/common/base.c index 1c208293..a637cff5 100644 --- a/common/base.c +++ b/common/base.c @@ -196,7 +196,7 @@ error: /**************************************************************************** * x264_picture_init: ****************************************************************************/ -static void picture_init( x264_picture_t *pic ) +REALIGN_STACK void x264_picture_init( x264_picture_t *pic ) { memset( pic, 0, sizeof( x264_picture_t ) ); pic->i_type = X264_TYPE_AUTO; @@ -204,15 +204,10 @@ static void picture_init( x264_picture_t *pic ) pic->i_pic_struct = PIC_STRUCT_AUTO; } -void x264_picture_init( x264_picture_t *pic ) -{ - x264_stack_align( picture_init, pic ); -} - /**************************************************************************** * x264_picture_alloc: ****************************************************************************/ -static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) +REALIGN_STACK int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) { typedef struct { @@ -243,7 +238,7 @@ static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_hei int csp = i_csp & X264_CSP_MASK; if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 ) return -1; - picture_init( pic ); + x264_picture_init( pic ); pic->img.i_csp = i_csp; pic->img.i_plane = csp_tab[csp].planes; int depth_factor = i_csp & X264_CSP_HIGH_DEPTH ? 2 : 1; @@ -265,15 +260,10 @@ static int picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_hei return 0; } -int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height ) -{ - return x264_stack_align( picture_alloc, pic, i_csp, i_width, i_height ); -} - /**************************************************************************** * x264_picture_clean: ****************************************************************************/ -static void picture_clean( x264_picture_t *pic ) +REALIGN_STACK void x264_picture_clean( x264_picture_t *pic ) { x264_free( pic->img.plane[0] ); @@ -281,15 +271,10 @@ static void picture_clean( x264_picture_t *pic ) memset( pic, 0, sizeof( x264_picture_t ) ); } -void x264_picture_clean( x264_picture_t *pic ) -{ - x264_stack_align( picture_clean, pic ); -} - /**************************************************************************** * x264_param_default: ****************************************************************************/ -static void param_default( x264_param_t *param ) +REALIGN_STACK void x264_param_default( x264_param_t *param ) { /* */ memset( param, 0, sizeof( x264_param_t ) ); @@ -434,11 +419,6 @@ static void param_default( x264_param_t *param ) param->i_avcintra_flavor = X264_AVCINTRA_FLAVOR_PANASONIC; } -void x264_param_default( x264_param_t *param ) -{ - x264_stack_align( param_default, param ); -} - static int param_apply_preset( x264_param_t *param, const char *preset ) { char *end; @@ -656,9 +636,9 @@ static int param_apply_tune( x264_param_t *param, const char *tune ) return 0; } -static int param_default_preset( x264_param_t *param, const char *preset, const char *tune ) +REALIGN_STACK int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune ) { - param_default( param ); + x264_param_default( param ); if( preset && param_apply_preset( param, preset ) < 0 ) return -1; @@ -667,12 +647,7 @@ static int param_default_preset( x264_param_t *param, const char *preset, const return 0; } -int x264_param_default_preset( x264_param_t *param, const char *preset, const char *tune ) -{ - return x264_stack_align( param_default_preset, param, preset, tune ); -} - -static void param_apply_fastfirstpass( x264_param_t *param ) +REALIGN_STACK void x264_param_apply_fastfirstpass( x264_param_t *param ) { /* Set faster options in case of turbo firstpass. */ if( param->rc.b_stat_write && !param->rc.b_stat_read ) @@ -687,11 +662,6 @@ static void param_apply_fastfirstpass( x264_param_t *param ) } } -void x264_param_apply_fastfirstpass( x264_param_t *param ) -{ - x264_stack_align( param_apply_fastfirstpass, param ); -} - static int profile_string_to_int( const char *str ) { if( !strcasecmp( str, "baseline" ) ) @@ -709,7 +679,7 @@ static int profile_string_to_int( const char *str ) return -1; } -static int param_apply_profile( x264_param_t *param, const char *profile ) +REALIGN_STACK int x264_param_apply_profile( x264_param_t *param, const char *profile ) { if( !profile ) return 0; @@ -776,11 +746,6 @@ static int param_apply_profile( x264_param_t *param, const char *profile ) return 0; } -int x264_param_apply_profile( x264_param_t *param, const char *profile ) -{ - return x264_stack_align( param_apply_profile, param, profile ); -} - static int parse_enum( const char *arg, const char * const *names, int *dst ) { for( int i = 0; names[i]; i++ ) @@ -842,7 +807,7 @@ static double atof_internal( const char *str, int *b_error ) #define atoi(str) atoi_internal( str, &b_error ) #define atof(str) atof_internal( str, &b_error ) -static int param_parse( x264_param_t *p, const char *name, const char *value ) +REALIGN_STACK int x264_param_parse( x264_param_t *p, const char *name, const char *value ) { char *name_buf = NULL; int b_error = 0; @@ -1343,11 +1308,6 @@ static int param_parse( x264_param_t *p, const char *name, const char *value ) return b_error ? errortype : 0; } -int x264_param_parse( x264_param_t *param, const char *name, const char *value ) -{ - return x264_stack_align( param_parse, param, name, value ); -} - /**************************************************************************** * x264_param2string: ****************************************************************************/ diff --git a/common/cpu.h b/common/cpu.h index f69f2e12..a8c480bb 100644 --- a/common/cpu.h +++ b/common/cpu.h @@ -46,23 +46,6 @@ void x264_cpu_sfence( void ); #endif #define x264_sfence x264_cpu_sfence -/* kludge: - * gcc can't give variables any greater alignment than the stack frame has. - * We need 32 byte alignment for AVX2, so here we make sure that the stack is - * aligned to 32 bytes. - * gcc 4.2 introduced __attribute__((force_align_arg_pointer)) to fix this - * problem, but I don't want to require such a new version. - * aligning to 32 bytes only works if the compiler supports keeping that - * alignment between functions (osdep.h handles manual alignment of arrays - * if it doesn't). - */ -#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4)) -intptr_t x264_stack_align( void (*func)(), ... ); -#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__) -#else -#define x264_stack_align(func,...) func(__VA_ARGS__) -#endif - typedef struct { const char *name; diff --git a/common/osdep.h b/common/osdep.h index 90067210..c48a1bb2 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -163,6 +163,12 @@ int x264_is_pipe( const char *path ); #define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16 #endif +#if STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4) +#define REALIGN_STACK __attribute__((force_align_arg_pointer)) +#else +#define REALIGN_STACK +#endif + #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) #define UNUSED __attribute__((unused)) #define ALWAYS_INLINE __attribute__((always_inline)) inline diff --git a/common/threadpool.c b/common/threadpool.c index 0d2be8e3..7fe34564 100644 --- a/common/threadpool.c +++ b/common/threadpool.c @@ -47,7 +47,7 @@ struct x264_threadpool_t x264_sync_frame_list_t done; /* list of jobs that have finished processing */ }; -static void *threadpool_thread_internal( x264_threadpool_t *pool ) +REALIGN_STACK static void *threadpool_thread( x264_threadpool_t *pool ) { if( pool->init_func ) pool->init_func( pool->init_arg ); @@ -72,11 +72,6 @@ static void *threadpool_thread_internal( x264_threadpool_t *pool ) return NULL; } -static void *threadpool_thread( x264_threadpool_t *pool ) -{ - return (void*)x264_stack_align( threadpool_thread_internal, pool ); -} - int x264_threadpool_init( x264_threadpool_t **p_pool, int threads, void (*init_func)(void *), void *init_arg ) { diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm index d84778db..f95a43a3 100644 --- a/common/x86/cpu-a.asm +++ b/common/x86/cpu-a.asm @@ -78,33 +78,7 @@ cglobal cpu_sfence sfence ret -%if ARCH_X86_64 - -;----------------------------------------------------------------------------- -; intptr_t stack_align( void (*func)(void*), ... ); (up to 5 args) -;----------------------------------------------------------------------------- -cvisible stack_align - mov rax, r0mp - mov r0, r1mp - mov r1, r2mp - mov r2, r3mp - mov r3, r4mp - mov r4, r5mp - push rbp - mov rbp, rsp -%if WIN64 - sub rsp, 40 ; shadow space + r4 -%endif - and rsp, ~(STACK_ALIGNMENT-1) -%if WIN64 - mov [rsp+32], r4 -%endif - call rax - leave - ret - -%else - +%if ARCH_X86_64 == 0 ;----------------------------------------------------------------------------- ; int cpu_cpuid_test( void ) ; return 0 if unsupported @@ -130,24 +104,4 @@ cglobal cpu_cpuid_test pop ebx popfd ret - -cvisible stack_align - push ebp - mov ebp, esp - sub esp, 20 - and esp, ~(STACK_ALIGNMENT-1) - mov r0, [ebp+12] - mov r1, [ebp+16] - mov r2, [ebp+20] - mov [esp+ 0], r0 - mov [esp+ 4], r1 - mov [esp+ 8], r2 - mov r0, [ebp+24] - mov r1, [ebp+28] - mov [esp+12], r0 - mov [esp+16], r1 - call [ebp+ 8] - leave - ret - %endif diff --git a/encoder/api.c b/encoder/api.c index 44fde51f..e50578fd 100644 --- a/encoder/api.c +++ b/encoder/api.c @@ -73,7 +73,7 @@ typedef struct x264_api_t int (*encoder_invalidate_reference)( x264_t *, int64_t pts ); } x264_api_t; -static x264_api_t *encoder_open( x264_param_t *param ) +REALIGN_STACK x264_t *x264_encoder_open( x264_param_t *param ) { x264_api_t *api = calloc( 1, sizeof( x264_api_t ) ); if( !api ) @@ -118,82 +118,77 @@ static x264_api_t *encoder_open( x264_param_t *param ) return NULL; } - return api; -} - -x264_t *x264_encoder_open( x264_param_t *param ) -{ /* x264_t is opaque */ - return (x264_t *)x264_stack_align( encoder_open, param ); + return (x264_t *)api; } -void x264_encoder_close( x264_t *h ) +REALIGN_STACK void x264_encoder_close( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; - x264_stack_align( api->encoder_close, api->x264 ); + api->encoder_close( api->x264 ); free( api ); } -void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ) +REALIGN_STACK void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ) { x264_api_t *api = (x264_api_t *)h; - x264_stack_align( api->nal_encode, api->x264, dst, nal ); + api->nal_encode( api->x264, dst, nal ); } -int x264_encoder_reconfig( x264_t *h, x264_param_t *param) +REALIGN_STACK int x264_encoder_reconfig( x264_t *h, x264_param_t *param) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_reconfig, api->x264, param ); + return api->encoder_reconfig( api->x264, param ); } -void x264_encoder_parameters( x264_t *h, x264_param_t *param ) +REALIGN_STACK void x264_encoder_parameters( x264_t *h, x264_param_t *param ) { x264_api_t *api = (x264_api_t *)h; - x264_stack_align( api->encoder_parameters, api->x264, param ); + api->encoder_parameters( api->x264, param ); } -int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) +REALIGN_STACK int x264_encoder_headers( x264_t *h, x264_nal_t **pp_nal, int *pi_nal ) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_headers, api->x264, pp_nal, pi_nal ); + return api->encoder_headers( api->x264, pp_nal, pi_nal ); } -int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ) +REALIGN_STACK int x264_encoder_encode( x264_t *h, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_encode, api->x264, pp_nal, pi_nal, pic_in, pic_out ); + return api->encoder_encode( api->x264, pp_nal, pi_nal, pic_in, pic_out ); } -int x264_encoder_delayed_frames( x264_t *h ) +REALIGN_STACK int x264_encoder_delayed_frames( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_delayed_frames, api->x264 ); + return api->encoder_delayed_frames( api->x264 ); } -int x264_encoder_maximum_delayed_frames( x264_t *h ) +REALIGN_STACK int x264_encoder_maximum_delayed_frames( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_maximum_delayed_frames, api->x264 ); + return api->encoder_maximum_delayed_frames( api->x264 ); } -void x264_encoder_intra_refresh( x264_t *h ) +REALIGN_STACK void x264_encoder_intra_refresh( x264_t *h ) { x264_api_t *api = (x264_api_t *)h; - x264_stack_align( api->encoder_intra_refresh, api->x264 ); + api->encoder_intra_refresh( api->x264 ); } -int x264_encoder_invalidate_reference( x264_t *h, int64_t pts ) +REALIGN_STACK int x264_encoder_invalidate_reference( x264_t *h, int64_t pts ) { x264_api_t *api = (x264_api_t *)h; - return x264_stack_align( api->encoder_invalidate_reference, api->x264, pts ); + return api->encoder_invalidate_reference( api->x264, pts ); } diff --git a/encoder/lookahead.c b/encoder/lookahead.c index 227c8fd3..61691e1f 100644 --- a/encoder/lookahead.c +++ b/encoder/lookahead.c @@ -87,7 +87,7 @@ static void lookahead_slicetype_decide( x264_t *h ) x264_pthread_mutex_unlock( &h->lookahead->ofbuf.mutex ); } -static void *lookahead_thread_internal( x264_t *h ) +REALIGN_STACK static void *lookahead_thread( x264_t *h ) { while( !h->lookahead->b_exit_thread ) { @@ -122,10 +122,6 @@ static void *lookahead_thread_internal( x264_t *h ) return NULL; } -static void *lookahead_thread( x264_t *h ) -{ - return (void*)x264_stack_align( lookahead_thread_internal, h ); -} #endif int x264_lookahead_init( x264_t *h, int i_slicetype_length ) diff --git a/tools/checkasm.c b/tools/checkasm.c index ea636692..36d73558 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -2914,7 +2914,7 @@ static int check_all_flags( void ) return ret; } -static int main_internal( int argc, char **argv ) +REALIGN_STACK int main( int argc, char **argv ) { #ifdef _WIN32 /* Disable the Windows Error Reporting dialog */ @@ -2973,8 +2973,3 @@ static int main_internal( int argc, char **argv ) print_bench(); return 0; } - -int main( int argc, char **argv ) -{ - return x264_stack_align( main_internal, argc, argv ); -} diff --git a/x264.c b/x264.c index f5e37a6c..d5cbf34b 100644 --- a/x264.c +++ b/x264.c @@ -373,7 +373,7 @@ static void print_version_info( void ) #endif } -static int main_internal( int argc, char **argv ) +REALIGN_STACK int main( int argc, char **argv ) { if( argc == 4 && !strcmp( argv[1], "--autocomplete" ) ) return x264_cli_autocomplete( argv[2], argv[3] ); @@ -428,11 +428,6 @@ static int main_internal( int argc, char **argv ) return ret; } -int main( int argc, char **argv ) -{ - return x264_stack_align( main_internal, argc, argv ); -} - static char const *strtable_lookup( const char * const table[], int idx ) { int i = 0; while( table[i] ) i++; -- 2.40.0