From: Henrik Gramner
Date: Wed, 1 Feb 2012 22:52:48 +0000 (+0100)
Subject: Fix incorrect zero-extension assumptions in x86_64 asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3131a19cabcdca221ce4cd61a3cff68d99f1a517;p=libx264

Fix incorrect zero-extension assumptions in x86_64 asm

Some x264 asm assumed that the high 32 bits of registers containing "int" values would be zero. This is almost always the case, and it seems to work with gcc, but it is *not* guaranteed by the ABI. As a result, it breaks with some other compilers, like Clang, that take advantage of this in optimizations. Accordingly, fix all x86 code by using intptr_t instead of int, or by using movsxd where necessary. Also add a checkasm hack to detect when assembly functions incorrectly assume that 32-bit integers are zero-extended to 64-bit.
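To make the failure mode concrete, here is a minimal sketch (illustrative only, not code from this patch; the SAD prototype is used as a representative example). Before the fix, strides were declared as 32-bit ints while the x86_64 asm behind the prototype used the full 64-bit registers for address arithmetic, so correctness silently depended on the caller having extended them:

/* pre-fix style: behaviour depends on bits 32-63 of the stride
 * registers, which the x86-64 ABI leaves unspecified for an "int" */
int x264_pixel_sad_16x16_sse2( uint8_t *pix1, int i_pix1,
                               uint8_t *pix2, int i_pix2 );

/* post-fix style: a pointer-sized stride makes all 64 bits of the
 * register meaningful, so no extension assumption remains */
int x264_pixel_sad_16x16_sse2( uint8_t *pix1, intptr_t i_pix1,
                               uint8_t *pix2, intptr_t i_pix2 );

Where a parameter deliberately stays int, the asm must extend it itself before using it in addressing, e.g. "movsxd r1, r1d" on x86_64.

---
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index 8ed0a227..507bbba1 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -32,7 +32,7 @@ // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 // They also use nothing above armv5te, but we don't care about pre-armv6 -// void prefetch_ref( uint8_t *pix, int stride, int parity ) +// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) function x264_prefetch_ref_arm sub r2, r2, #1 add r0, r0, #64 @@ -51,8 +51,8 @@ function x264_prefetch_ref_arm bx lr .endfunc -// void prefetch_fenc( uint8_t *pix_y, int stride_y, -// uint8_t *pix_uv, int stride_uv, int mb_x ) +// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, +// uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) function x264_prefetch_fenc_arm ldr ip, [sp] push {lr} @@ -78,7 +78,7 @@ function x264_prefetch_fenc_arm .endfunc -// void *x264_memcpy_aligned( void * dst, const void * src, size_t n ) +// void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) function x264_memcpy_aligned_neon orr r3, r0, r1, lsr #1 movrel ip, memcpy_table @@ -158,9 +158,9 @@ memzero_loop: .endfunc -// void pixel_avg( uint8_t *dst, int dst_stride, -// uint8_t *src1, int src1_stride, -// uint8_t *src2, int src2_stride, int weight ); +// void pixel_avg( uint8_t *dst, intptr_t dst_stride, +// uint8_t *src1, intptr_t src1_stride, +// uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH w h function x264_pixel_avg_\w\()x\h\()_neon ldr ip, [sp, #8] @@ -455,7 +455,7 @@ avg2_w20_loop: .endif .endm -// void mc_weight( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, +// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride, // const x264_weight_t *weight, int height ) function x264_mc_weight_w20_neon weight_prologue full @@ -744,7 +744,7 @@ weight_simple offsetadd, vqadd.u8 weight_simple offsetsub, vqsub.u8 -// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height ) +// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) function x264_mc_copy_w4_neon ldr ip, [sp] copy_w4_loop: @@ -810,8 +810,8 @@ copy_w16_aligned_loop: .endfunc -// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride, -// uint8_t *src, int i_src_stride, +// void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, +// uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); function x264_mc_chroma_neon push {r4-r6, lr} @@ -1052,7 +1052,7 @@ mc_chroma_w8: .endfunc -// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width) +// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, 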
int width ) function x264_hpel_filter_v_neon ldr ip, [sp] sub r1, r1, r3, lsl #1 @@ -1266,7 +1266,7 @@ filter_h_loop: // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, -// uint8_t *dstc, int src_stride, int dst_stride, int width, +// uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width, // int height ) function x264_frame_init_lowres_core_neon push {r4-r10,lr} diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index de4aec4e..1148ae76 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -26,33 +26,33 @@ #include "common/common.h" #include "mc.h" -void x264_prefetch_ref_arm( uint8_t *, int, int ); -void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int ); - -void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n ); -void x264_memzero_aligned_neon( void *dst, int n ); - -void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); - -void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +void x264_prefetch_ref_arm( uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_neon( void *dst, size_t n ); + +void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define MC_WEIGHT(func)\ -void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ -void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const 
x264_weight_t *, int );\ -void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ -void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ +void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ \ -static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\ +static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ @@ -67,15 +67,15 @@ MC_WEIGHT(_nodenom) MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetsub) -void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); -void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int ); -void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int); +void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); -void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int ); +void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int ); void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); @@ -101,7 +101,7 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) w->weightfn = x264_mc_wtab_neon; } -static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = +static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = { NULL, x264_pixel_avg2_w4_neon, @@ -111,7 +111,7 @@ static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, in x264_pixel_avg2_w20_neon, }; -static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) = +static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = { NULL, x264_mc_copy_w4_neon, @@ -123,13 +123,13 @@ static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; -static void mc_luma_neon( uint8_t *dst, int i_dst_stride, - uint8_t *src[4], int i_src_stride, +static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int 
i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; @@ -149,13 +149,13 @@ static void mc_luma_neon( uint8_t *dst, int i_dst_stride, x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } -static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, - uint8_t *src[4], int i_src_stride, +static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; @@ -183,9 +183,9 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, } static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - int stride, int width, int height, int16_t *buf ) + intptr_t stride, int width, int height, int16_t *buf ) { - int realign = (intptr_t)src & 15; + intptr_t realign = (intptr_t)src & 15; src -= realign; dstv -= realign; dstc -= realign; diff --git a/common/arm/pixel.h b/common/arm/pixel.h index d0c90dae..ba390112 100644 --- a/common/arm/pixel.h +++ b/common/arm/pixel.h @@ -39,11 +39,11 @@ DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) #define DECL_X4( name, suffix ) \ - DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\ - DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) ) + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) -int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); DECL_X1( sad, neon ) DECL_X1( sad_aligned, neon ) @@ -52,21 +52,21 @@ DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) -int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); -uint64_t x264_pixel_var_8x8_neon( uint8_t *, int ); -uint64_t x264_pixel_var_16x16_neon( uint8_t *, int ); -int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * ); +uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); +int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int ); +uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t 
*, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); -void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int, - const uint8_t *, int, - int sums[2][4]); +void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, + const uint8_t *, intptr_t, + int sums[2][4] ); float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); #endif diff --git a/common/deblock.c b/common/deblock.c index 922b076d..51f0d7a8 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -76,7 +76,7 @@ static const int8_t i_tc0_table[52+12*3][4] = #define tc0_table(x) i_tc0_table[(x)+24] /* From ffmpeg */ -static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 ) +static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -107,7 +107,7 @@ static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alph pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { @@ -120,21 +120,21 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); } -static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, stride, 1, alpha, beta, tc0 ); } -static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, 1, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc ) +static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -148,7 +148,7 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { @@ -163,24 +163,24 @@ static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 
1, 2, stride, alpha, beta, tc0 ); } -static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } -static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); } -static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -219,26 +219,26 @@ static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, in } } } -static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) +static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); } -static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, stride, 1, alpha, beta ); } -static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -251,25 +251,25 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } -static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < height; d++, pix += ystride-2 ) for( int e = 0; e < width; e++, pix++ ) deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta ); } -static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 1, 16, stride, 
2, alpha, beta ); } -static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta ); } -static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta ); } @@ -303,7 +303,8 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 } } -static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) +static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + a; int index_b = i_qp + b; @@ -322,7 +323,8 @@ static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uin pf_inter( pix, i_stride, alpha, beta, tc ); } -static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) +static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) { int index_a = i_qp + a; int index_b = i_qp + b; @@ -631,30 +633,30 @@ void x264_macroblock_deblock( x264_t *h ) } #if HAVE_MMX -void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void 
x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); @@ -668,32 +670,32 @@ void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); -void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #if ARCH_X86 
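/* [Illustrative aside, not part of the patch.] The "checkasm hack"
 * mentioned in the commit message works by planting garbage in bits
 * 32-63 of the registers that carry 32-bit int arguments before
 * calling the asm under test; code that wrongly assumes those bits
 * are zero then computes bogus addresses and fails (or crashes) the
 * test. A simplified, hypothetical C sketch of the idea -- the real
 * tool does this in an asm call wrapper, and sad_raw_fn/call_dirty
 * are made-up names: */
typedef int (*sad_raw_fn)( uint8_t *, uint64_t, uint8_t *, uint64_t );
static int call_dirty( sad_raw_fn fn, uint8_t *p1, int s1, uint8_t *p2, int s2 )
{
    uint64_t garbage = 0xdeadbeef00000000ULL; /* nonzero upper half */
    /* On x86-64 SysV every integer argument occupies a full 64-bit
     * register, so this simulates a caller that never zero-extended
     * its 32-bit ints: */
    return fn( p1, garbage | (uint32_t)s1, p2, garbage | (uint32_t)s2 );
}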
-void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #if HIGH_BIT_DEPTH -void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #else // FIXME this wrapper has a significant cpu cost -static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 ); x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 ); } -static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta ) +static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ) { x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta ); x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta ); @@ -703,15 +705,15 @@ static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, #endif #if ARCH_PPC -void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif // ARCH_PPC #if HAVE_ARMV6 -void x264_deblock_v_luma_neon( 
uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * ); +void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) diff --git a/common/frame.c b/common/frame.c index e13e5097..21d13476 100644 --- a/common/frame.c +++ b/common/frame.c @@ -710,8 +710,8 @@ x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ) return frame; } -void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, - int i_width, int i_height, x264_weight_t *w ) +void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, + int i_width, int i_height, x264_weight_t *w ) { /* Weight horizontal strips of height 16. This was found to be the optimal height * in terms of the cache loads. */ diff --git a/common/frame.h b/common/frame.h index 94e875d2..54415f7f 100644 --- a/common/frame.h +++ b/common/frame.h @@ -178,8 +178,8 @@ typedef struct x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */ } x264_sync_frame_list_t; -typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta ); +typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta ); typedef struct { x264_deblock_inter_t deblock_luma[2]; @@ -232,7 +232,7 @@ x264_frame_t *x264_frame_shift( x264_frame_t **list ); void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ); void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame ); x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ); -void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, +void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, int i_width, int i_height, x264_weight_t *w ); x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ); void x264_frame_delete_list( x264_frame_t **list ); diff --git a/common/macroblock.c b/common/macroblock.c index d600f82d..b4e6d951 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -121,7 +121,7 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int i_mode = x264_size2pixel[height][width]; - int i_stride0 = 16, i_stride1 = 16; + intptr_t i_stride0 = 16, i_stride1 = 16; ALIGNED_ARRAY_16( pixel, tmp0,[16*16] ); ALIGNED_ARRAY_16( pixel, tmp1,[16*16] ); pixel *src0, *src1; diff --git a/common/mc.c b/common/mc.c index 88ed6ea8..86f7e35a 100644 --- a/common/mc.c +++ b/common/mc.c @@ -37,10 +37,9 @@ #endif -static inline void pixel_avg( pixel *dst, int 
i_dst_stride, - pixel *src1, int i_src1_stride, - pixel *src2, int i_src2_stride, - int i_width, int i_height ) +static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, + pixel *src1, intptr_t i_src1_stride, + pixel *src2, intptr_t i_src2_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { @@ -52,7 +51,9 @@ static inline void pixel_avg( pixel *dst, int i_dst_stride, } } -static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height ) +static inline void pixel_avg_wxh( pixel *dst, intptr_t i_dst, + pixel *src1, intptr_t i_src1, + pixel *src2, intptr_t i_src2, int width, int height ) { for( int y = 0; y < height; y++ ) { @@ -66,9 +67,11 @@ static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1 /* Implicit weighted bipred only: * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */ -static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height, int i_weight1 ) +static inline void pixel_avg_weight_wxh( pixel *dst, intptr_t i_dst, + pixel *src1, intptr_t i_src1, + pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 ) { - const int i_weight2 = 64 - i_weight1; + int i_weight2 = 64 - i_weight1; for( int y = 0; y> 6 ); @@ -76,9 +79,9 @@ static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int #undef op_scale2 #define PIXEL_AVG_C( name, width, height ) \ -static void name( pixel *pix1, int i_stride_pix1, \ - pixel *pix2, int i_stride_pix2, \ - pixel *pix3, int i_stride_pix3, int weight ) \ +static void name( pixel *pix1, intptr_t i_stride_pix1, \ + pixel *pix2, intptr_t i_stride_pix2, \ + pixel *pix3, intptr_t i_stride_pix3, int weight ) \ { \ if( weight == 32 ) \ pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \ @@ -104,7 +107,8 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w ) } #define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset ) #define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset ) -static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height ) +static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, + const x264_weight_t *weight, int i_width, int i_height ) { int offset = weight->i_offset << (BIT_DEPTH-8); int scale = weight->i_scale; @@ -124,7 +128,7 @@ static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_strid } #define MC_WEIGHT_C( name, width ) \ - static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \ + static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \ { \ mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\ } @@ -146,7 +150,7 @@ static weight_fn_t x264_mc_weight_wtab[6] = mc_weight_w20, }; const x264_weight_t x264_weight_none[3] = { {{0}} }; -static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, int i_width, int i_height ) +static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { @@ -159,7 +163,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, #define 
TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d])) static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, - int stride, int width, int height, int16_t *buf ) + intptr_t stride, int width, int height, int16_t *buf ) { const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0; for( int y = 0; y < height; y++ ) @@ -185,8 +189,8 @@ static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; -static void mc_luma( pixel *dst, int i_dst_stride, - pixel *src[4], int i_src_stride, +static void mc_luma( pixel *dst, intptr_t i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { @@ -208,8 +212,8 @@ static void mc_luma( pixel *dst, int i_dst_stride, mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height ); } -static pixel *get_ref( pixel *dst, int *i_dst_stride, - pixel *src[4], int i_src_stride, +static pixel *get_ref( pixel *dst, intptr_t *i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { @@ -239,8 +243,8 @@ static pixel *get_ref( pixel *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride, - pixel *src, int i_src_stride, +static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride, + pixel *src, intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height ) { @@ -273,7 +277,7 @@ static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride, } #define MC_COPY(W) \ -static void mc_copy_w##W( pixel *dst, int i_dst, pixel *src, int i_src, int i_height ) \ +static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \ { \ mc_copy( src, i_src, dst, i_dst, W, i_height ); \ } @@ -281,8 +285,8 @@ MC_COPY( 16 ) MC_COPY( 8 ) MC_COPY( 4 ) -void x264_plane_copy_c( pixel *dst, int i_dst, - pixel *src, int i_src, int w, int h ) +void x264_plane_copy_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) { while( h-- ) { @@ -292,9 +296,9 @@ void x264_plane_copy_c( pixel *dst, int i_dst, } } -void x264_plane_copy_interleave_c( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ) +void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ) { for( int y=0; yssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v ); if( i_width&7 ) @@ -171,7 +174,7 @@ void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pi * pixel_var_wxh ****************************************************************************/ #define PIXEL_VAR_C( name, w, h ) \ -static uint64_t name( pixel *pix, int i_stride ) \ +static uint64_t name( pixel *pix, intptr_t i_stride ) \ { \ uint32_t sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ @@ -194,7 +197,7 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 ) * pixel_var2_wxh ****************************************************************************/ #define PIXEL_VAR2_C( name, w, h, shift ) \ -static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \ +static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \ { \ uint32_t var = 
0, sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ @@ -249,7 +252,7 @@ static ALWAYS_INLINE sum2_t abs2( sum2_t a ) * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences ****************************************************************************/ -static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[4][2]; sum2_t a0, a1, a2, a3, b0, b1; @@ -274,7 +277,7 @@ static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, i return sum >> 1; } -static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[4][4]; sum2_t a0, a1, a2, a3; @@ -296,7 +299,7 @@ static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, i } #define PIXEL_SATD_C( w, h, sub )\ -static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\ +static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\ {\ int sum = sub( pix1, i_pix1, pix2, i_pix2 )\ + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\ @@ -318,7 +321,7 @@ PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 ) PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 ) -static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -352,13 +355,13 @@ static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) return sum; } -static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ); return (sum+2)>>2; } -static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ) + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 ) @@ -368,7 +371,7 @@ static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pi } -static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride ) +static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; sum2_t a0, a1, a2, a3, dc; @@ -406,7 +409,7 @@ static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride ) } #define HADAMARD_AC(w,h) \ -static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\ +static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\ {\ uint64_t sum = pixel_hadamard_ac( pix, stride );\ if( w==16 )\ @@ -427,13 +430,15 @@ HADAMARD_AC( 8, 8 ) * pixel_sad_x4 ****************************************************************************/ #define SAD_X( size ) \ -static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\ +static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = 
x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\ +static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3,\ + intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ @@ -464,13 +469,15 @@ SAD_X( 8x8_vis ) ****************************************************************************/ #define SATD_X( size, cpu ) \ -static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\ +static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\ +static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\ + intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ @@ -576,9 +583,9 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 ) /**************************************************************************** * structural similarity metric ****************************************************************************/ -static void ssim_4x4x2_core( const pixel *pix1, int stride1, - const pixel *pix2, int stride2, - int sums[2][4]) +static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1, + const pixel *pix2, intptr_t stride2, + int sums[2][4] ) { for( int z = 0; z < 2; z++ ) { @@ -640,8 +647,8 @@ static float ssim_end4( int sum0[5][4], int sum1[5][4], int width ) } float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, - pixel *pix1, int stride1, - pixel *pix2, int stride2, + pixel *pix1, intptr_t stride1, + pixel *pix2, intptr_t stride2, int width, int height, void *buf, int *cnt ) { int z = 0; @@ -665,7 +672,7 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, return ssim; } -static int pixel_vsad( pixel *src, int stride, int height ) +static int pixel_vsad( pixel *src, intptr_t stride, int height ) { int score = 0; for( int i = 1; i < height; i++, src += stride ) diff --git a/common/pixel.h b/common/pixel.h index b3935726..50589137 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -30,9 +30,9 @@ // SSD assumes all args aligned // other cmp functions assume first arg aligned -typedef int (*x264_pixel_cmp_t) ( pixel *, int, pixel *, int ); -typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, int, int[3] ); -typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int[4] ); +typedef int (*x264_pixel_cmp_t) ( pixel *, intptr_t, pixel *, intptr_t ); +typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] ); +typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] ); enum { @@ -88,18 +88,18 @@ typedef struct x264_pixel_cmp_x3_t fpelcmp_x3[7]; 
x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ - int (*vsad)( pixel *, int, int ); + int (*vsad)( pixel *, intptr_t, int ); - uint64_t (*var[4])( pixel *pix, int stride ); - int (*var2[4])( pixel *pix1, int stride1, - pixel *pix2, int stride2, int *ssd ); - uint64_t (*hadamard_ac[4])( pixel *pix, int stride ); + uint64_t (*var[4])( pixel *pix, intptr_t stride ); + int (*var2[4])( pixel *pix1, intptr_t stride1, + pixel *pix2, intptr_t stride2, int *ssd ); + uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride ); - void (*ssd_nv12_core)( pixel *pixuv1, int stride1, - pixel *pixuv2, int stride2, int width, int height, + void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); - void (*ssim_4x4x2_core)( const pixel *pix1, int stride1, - const pixel *pix2, int stride2, int sums[2][4] ); + void (*ssim_4x4x2_core)( const pixel *pix1, intptr_t stride1, + const pixel *pix2, intptr_t stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); /* multiple parallel calls to cmp. */ @@ -143,9 +143,12 @@ typedef struct } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); -void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ); -uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ); -float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf, int *cnt ); +void x264_pixel_ssd_nv12 ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ); +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height ); +float x264_pixel_ssim_wxh ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height, void *buf, int *cnt ); int x264_field_vsad( x264_t *h, int mb_x, int mb_y ); #endif diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c index a9e862e3..dea872ba 100644 --- a/common/ppc/deblock.c +++ b/common/ppc/deblock.c @@ -267,7 +267,7 @@ static inline vec_u8_t h264_deblock_q1( register vec_u8_t p0, register vec_u8_t q1 = newq1; \ } -void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 ) { @@ -285,7 +285,7 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, } } -void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { register vec_u8_t line0, line1, line2, line3, line4, line5; diff --git a/common/ppc/mc.c b/common/ppc/mc.c index 0fc735c2..2e720f47 100644 --- a/common/ppc/mc.c +++ b/common/ppc/mc.c @@ -37,8 +37,8 @@ #include "ppccommon.h" #if !HIGH_BIT_DEPTH -typedef void (*pf_mc_t)( uint8_t *src, int i_src, - uint8_t *dst, int i_dst, int i_height ); +typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, + uint8_t *dst, intptr_t i_dst, int i_height ); 
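/* [Illustrative aside, not part of the patch.] In the hunks below,
 * the "offset" temporaries in mc_luma_altivec and get_ref_altivec
 * switch from int to intptr_t: with a pointer-sized stride the
 * product (mvy>>2)*i_src_stride is computed in 64 bits, and keeping
 * the result pointer-sized avoids truncating it back to 32 bits
 * before the pointer add. A minimal sketch of the pattern
 * (ref_pixel is a made-up name): */
static inline uint8_t *ref_pixel( uint8_t *plane, intptr_t stride, int mvx, int mvy )
{
    intptr_t offset = (mvy >> 2) * stride + (mvx >> 2); /* pointer-sized end to end */
    return plane + offset;
}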
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; @@ -58,8 +58,8 @@ static inline int x264_tapfilter1( uint8_t *pix ) } -static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { for( int y = 0; y < i_height; y++ ) @@ -72,8 +72,8 @@ static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; @@ -95,8 +95,8 @@ static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; @@ -117,8 +117,8 @@ static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height); @@ -128,8 +128,8 @@ static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst, /* mc_copy: plain c */ #define MC_COPY( name, a ) \ -static void name( uint8_t *dst, int i_dst, \ - uint8_t *src, int i_src, int i_height ) \ +static void name( uint8_t *dst, intptr_t i_dst, \ + uint8_t *src, intptr_t i_src, int i_height ) \ { \ int y; \ for( y = 0; y < i_height; y++ ) \ @@ -142,14 +142,14 @@ static void name( uint8_t *dst, int i_dst, \ MC_COPY( x264_mc_copy_w4_altivec, 4 ) MC_COPY( x264_mc_copy_w8_altivec, 8 ) -static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int i_height ) +static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height ) { vec_u8_t cpyV; PREP_LOAD; PREP_LOAD_SRC( src ); - for( int y = 0; y < i_height; y++) + for( int y = 0; y < i_height; y++ ) { VEC_LOAD( src, cpyV, 16, vec_u8_t, src ); vec_st(cpyV, 0, dst); @@ -160,12 +160,12 @@ static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst, } -static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int i_height ) +static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height ) { - for( int y = 0; y < i_height; ++y) + for( int y = 0; y < i_height; ++y ) { - vec_u8_t cpyV = vec_ld( 0, src); + vec_u8_t cpyV = vec_ld( 0, src ); vec_st(cpyV, 0, dst); src += i_src; @@ -174,13 +174,13 @@ static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst, } -static void mc_luma_altivec( uint8_t *dst, int i_dst_stride, - uint8_t *src[4], int i_src_stride, +static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = 
src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { @@ -222,13 +222,13 @@ static void mc_luma_altivec( uint8_t *dst, int i_dst_stride, -static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride, - uint8_t *src[4], int i_src_stride, +static uint8_t *get_ref_altivec( uint8_t *dst, intptr_t *i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { @@ -266,10 +266,9 @@ static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride, } } -static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx&0x07; @@ -297,10 +296,9 @@ static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, } } -static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; @@ -386,10 +384,9 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_strid } } -static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; @@ -510,10 +507,9 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_strid } } -static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_width, int i_height ) +static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_width, int i_height ) { if( i_width == 8 ) mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride, @@ -670,7 +666,7 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, } void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - int i_stride, int i_width, int i_height, int16_t *buf ) + intptr_t i_stride, int i_width, int i_height, int16_t *buf ) { vec_u8_t destv; vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v; @@ -765,7 +761,7 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint } static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, - int src_stride, int dst_stride, int width, int height ) + intptr_t src_stride, intptr_t dst_stride, int width, int height ) { int w = width >> 4; int end = (width & 15); @@ -857,7 +853,7 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_ } } -static void 
mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -911,7 +907,7 @@ static void mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -965,7 +961,7 @@ static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -1020,7 +1016,7 @@ static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -1080,7 +1076,7 @@ static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_ } } } -static void mc_weight_w20_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index b60bfaf0..585bc197 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -34,8 +34,8 @@ **********************************************************************/ #define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \ -static int name( uint8_t *pix1, int i_pix1, \ - uint8_t *pix2, int i_pix2 ) \ +static int name( uint8_t *pix1, intptr_t i_pix1, \ + uint8_t *pix2, intptr_t i_pix2 ) \ { \ ALIGNED_16( int sum ); \ \ @@ -119,8 +119,8 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 ) /*********************************************************************** * SATD 4x4 **********************************************************************/ -static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -164,8 +164,8 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 4x8 **********************************************************************/ -static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -218,8 +218,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 8x4 **********************************************************************/ -static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -272,8 +272,8 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1, 
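/* [Editor's sketch - illustrative, not part of the patch; helper names are
 * hypothetical.] Why "offset" is widened along with the strides in the
 * mc_luma_altivec/get_ref_altivec hunks above: once i_src_stride is
 * intptr_t, the (mvy>>2)*i_src_stride product is computed at pointer
 * width, and funnelling it through a 32-bit int would truncate it again
 * before the pointer addition. */
#include <stdint.h>

static uint8_t *mv_to_pixel( uint8_t *plane, intptr_t stride, int mvx, int mvy )
{
    /* full-width arithmetic end to end; with "int offset" the
     * (mvy>>2)*stride product would be squeezed through 32 bits
     * on its way into the pointer addition */
    intptr_t offset = (mvy >> 2) * stride + (mvx >> 2);
    return plane + offset;
}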
/*********************************************************************** * SATD 8x8 **********************************************************************/ -static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -332,8 +332,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 8x16 **********************************************************************/ -static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -416,8 +416,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 16x8 **********************************************************************/ -static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -500,8 +500,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 16x16 **********************************************************************/ -static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -632,7 +632,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -744,7 +744,7 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -834,7 +834,8 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, scores[2] = sum2; } -static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] ) +static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, + uint8_t *pix3, intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -945,7 +946,7 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1038,7 +1039,7 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, static void pixel_sad_x4_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1152,7 +1153,7 @@ static void pixel_sad_x4_8x16_altivec( 
uint8_t *fenc, static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1247,7 +1248,7 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1361,7 +1362,7 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1457,8 +1458,8 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, * SSD routines **********************************************************************/ -static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1, - uint8_t *pix2, int i_stride_pix2) +static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1, + uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); @@ -1536,8 +1537,8 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1, return sum; } -static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, - uint8_t *pix2, int i_stride_pix2) +static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1, + uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); @@ -1588,7 +1589,7 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, /**************************************************************************** * variance ****************************************************************************/ -static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1615,7 +1616,7 @@ static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) return sum + ((uint64_t)sqr<<32); } -static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1713,8 +1714,8 @@ static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) sa8d7v = vec_sub(b6v, b7v); \ } -static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd=0; @@ -1781,21 +1782,21 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, return i_satd; } -static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2; return i_satd; } -static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; - i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 ) - + 
pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 ) + i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 ) + + pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2; return i_satd; @@ -1817,7 +1818,7 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \ vec_s16_t pix16_d##num; -static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm ) +static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm ) { ALIGNED_16( int32_t sum4_tab[4] ); ALIGNED_16( int32_t sum8_tab[4] ); @@ -1903,7 +1904,7 @@ static const vec_u8_t hadamard_permtab[] = 0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F ) }; -static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; @@ -1915,7 +1916,7 @@ static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; @@ -1925,7 +1926,7 @@ static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); @@ -1933,7 +1934,7 @@ static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); @@ -1944,8 +1945,8 @@ static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride ) /**************************************************************************** * structural similarity metric ****************************************************************************/ -static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1, - const uint8_t *pix2, int stride2, +static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1, + const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) { ALIGNED_16( int temp[4] ); @@ -1986,13 +1987,15 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1, } #define SATD_X( size ) \ -static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = 
pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ + uint8_t *pix3, intptr_t i_stride, int scores[4] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ diff --git a/common/sparc/pixel.h b/common/sparc/pixel.h index 32498db4..0c762d4a 100644 --- a/common/sparc/pixel.h +++ b/common/sparc/pixel.h @@ -26,9 +26,9 @@ #ifndef X264_SPARC_PIXEL_H #define X264_SPARC_PIXEL_H -int x264_pixel_sad_8x8_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_8x16_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x8_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x16_vis( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t ); #endif diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 7622eb69..e7c5dc31 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -160,7 +160,7 @@ cextern pw_pixel_max %macro DEBLOCK_LUMA 0 ;----------------------------------------------------------------------------- -; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma, 5,5,8 %assign pad 5*mmsize+12-(stack_offset&15) @@ -603,7 +603,7 @@ DEBLOCK_LUMA_64 %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA_INTRA_64 0 cglobal deblock_v_luma_intra, 4,7,16 @@ -653,7 +653,7 @@ cglobal deblock_v_luma_intra, 4,7,16 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,16 %define t0 m15 @@ -722,7 +722,7 @@ DEBLOCK_LUMA_INTRA_64 %macro DEBLOCK_LUMA_INTRA 0 ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma_intra, 4,7,8 LUMA_INTRA_INIT 3 @@ -748,7 +748,7 @@ cglobal deblock_v_luma_intra, 4,7,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int 
alpha, int beta ) +; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,8 LUMA_INTRA_INIT 8 @@ -1090,7 +1090,7 @@ DEBLOCK_LUMA_INTRA %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA 0 cglobal deblock_v_luma, 5,5,10 @@ -1135,12 +1135,11 @@ cglobal deblock_v_luma, 5,5,10 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname cglobal deblock_h_luma, 5,9 - movsxd r7, r1d - lea r8, [r7*3] + lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 @@ -1152,14 +1151,15 @@ cglobal deblock_h_luma, 5,9 %endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp + lea r6, [r6+r1*8] + lea r5, [r5+r1*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them + mov r7, r1 lea r0, [pix_tmp+0x30] mov r1d, 0x10 %if WIN64 @@ -1203,7 +1203,7 @@ DEBLOCK_LUMA %macro DEBLOCK_LUMA 2 ;----------------------------------------------------------------------------- -; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma, 5,5 lea r4, [r1*3] @@ -1255,7 +1255,7 @@ cglobal deblock_%1_luma, 5,5 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname cglobal deblock_h_luma, 0,5 @@ -1452,7 +1452,7 @@ DEBLOCK_LUMA v, 16 %endif ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma_intra, 4,6,16 %if ARCH_X86_64 == 0 @@ -1514,24 +1514,24 @@ cglobal deblock_%1_luma_intra, 4,6,16 INIT_MMX cpuname %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,9 - movsxd r7, 
r1d - lea r8, [r7*3] + lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - mov r1, 0x10 + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r1*8] + lea r5, [r5+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + mov r7, r1 + lea r0, [pix_tmp+0x40] + mov r1, 0x10 call deblock_v_luma_intra ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) @@ -1685,9 +1685,9 @@ cglobal deblock_inter_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma, 7,7,8 +cglobal deblock_v_chroma, 5,7,8 FIX_STRIDES r1 mov r5, r0 sub r0, r1 @@ -1705,7 +1705,7 @@ cglobal deblock_v_chroma, 7,7,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 add r1, r1 @@ -1731,7 +1731,7 @@ cglobal deblock_intra_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,6,8 add r1, r1 @@ -1752,7 +1752,7 @@ cglobal deblock_v_chroma_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 add r1, r1 @@ -1770,7 +1770,7 @@ cglobal deblock_h_chroma_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra_mbaff, 4,6,8 add r1, r1 @@ -1793,7 +1793,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_mbaff, 5,7,8 add r1, r1 @@ -1821,7 +1821,7 
@@ cglobal deblock_h_chroma_mbaff, 5,7,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422_intra, 4,6,8 add r1, r1 @@ -1839,7 +1839,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422, 5,7,8 add r1, r1 @@ -1940,7 +1940,7 @@ cglobal chroma_inter_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma, 5,6,8 CHROMA_V_START @@ -1955,7 +1955,7 @@ cglobal deblock_v_chroma, 5,6,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 CHROMA_H_START @@ -1980,7 +1980,7 @@ DEBLOCK_CHROMA %endif ;----------------------------------------------------------------------------- -; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 @@ -2076,7 +2076,7 @@ cglobal chroma_intra_body %macro DEBLOCK_CHROMA_INTRA 0 ;----------------------------------------------------------------------------- -; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,5,8 CHROMA_V_START @@ -2091,7 +2091,7 @@ cglobal deblock_v_chroma_intra, 4,5,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 CHROMA_H_START @@ -2132,7 +2132,7 @@ DEBLOCK_CHROMA_INTRA %endif ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal 
deblock_h_chroma_intra_mbaff, 4,6,8 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 689999ee..923a2cd3 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -61,8 +61,7 @@ cextern pd_32 %if WIN64 DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 - PROLOGUE 5,7,%1 - movsxd r5, dword r5m + PROLOGUE 6,7,%1 %endmacro %elif UNIX64 DECLARE_REG_TMP 0,1,2,3,4,5,7,8 @@ -190,7 +189,7 @@ cextern pd_32 %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 1-2 0 cglobal pixel_avg_weight_w%1 @@ -403,7 +402,7 @@ AVG_WEIGHT 16, 7 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h ) +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- %macro WEIGHTER 1 @@ -479,7 +478,7 @@ WEIGHTER 20 %endmacro ;----------------------------------------------------------------------------- -;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h ) +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- %macro OFFSET 2 cglobal mc_offset%2_w%1, 6,6 @@ -524,8 +523,8 @@ OFFSETPN 8 ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_avg_4x4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 2 cglobal pixel_avg_%1x%2 @@ -540,9 +539,8 @@ cglobal pixel_avg_%1x%2 %endmacro ;----------------------------------------------------------------------------- -; void pixel_avg_w4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_FUNC 3 @@ -648,8 +646,8 @@ AVGH 4, 2 %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void pixel_avg2_wN( uint16_t *dst, int dst_stride, -; uint16_t *src1, int src_stride, +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 @@ -832,8 +830,8 @@ cglobal pixel_avg2_w18_sse2, 6,7,6 %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_avg2_w4( uint8_t *dst, int dst_stride, -; uint8_t 
*src1, int src_stride, +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 @@ -1194,8 +1192,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k %endmacro ;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 @@ -1250,14 +1248,14 @@ MC_COPY 16 ; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void prefetch_fenc( pixel *pix_y, int stride_y, -; pixel *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- %macro PREFETCH_FENC 1 %if ARCH_X86_64 cglobal prefetch_fenc_%1, 5,5 - FIX_STRIDES r1d, r3d + FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d @@ -1317,11 +1315,11 @@ PREFETCH_FENC 420 PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void prefetch_ref( pixel *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal prefetch_ref, 3,3 - FIX_STRIDES r1d + FIX_STRIDES r1 dec r2d and r2d, r1d lea r0, [r0+r2*8+64*SIZEOF_PIXEL] @@ -1397,8 +1395,8 @@ cglobal prefetch_ref, 3,3 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, -; uint8_t *src, int src_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 0c15a416..12bec5b2 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -155,14 +155,11 @@ cextern pd_ffff %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_FILTER 0 cglobal hpel_filter_v, 5,6,11 - FIX_STRIDES r3d, r4d -%if WIN64 - movsxd r4, r4d -%endif + FIX_STRIDES r3, r4 lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -179,7 +176,7 @@ cglobal hpel_filter_v, 5,6,11 %define s30 [pad30] %endif add r0, r4 - lea r2, [r2+r4] + add r2, r4 neg r4 mova m7, [pw_pixel_max] pxor m0, m0 @@ -216,12 +213,12 @@ cglobal hpel_filter_v, 5,6,11 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,10 add r2, r2 
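; [Editor's note - illustrative asm commentary, not part of the patch.]
; "add r2, r2" above doubles the width argument for 16-bit pixels as a
; plain 64-bit ALU op; that is only safe because width is now declared
; intptr_t, so the caller materializes the full register. With the old
; "int width" prototype, the defensive pattern removed elsewhere in this
; diff would be needed first (hypothetical registers):
;
;   movsxd r2, r2d      ; int arg: only the low 32 bits are trustworthy
;   add    r2, r2       ; ...then 64-bit math is safe
;
; Dropping such movsxd fix-ups is the recurring simplification in the
; asm hunks here.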
add r0, r2 - lea r1, [r1+r2] + add r1, r2 neg r2 mova m0, [tap1] mova m7, [tap3] @@ -265,7 +262,7 @@ cglobal hpel_filter_c, 3,3,10 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width ); +; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h, 3,4,8 %define src r1+r2 @@ -317,12 +314,9 @@ HPEL_FILTER %if HIGH_BIT_DEPTH == 0 %macro HPEL_V 1 ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_v, 5,6,%1 -%if WIN64 - movsxd r4, r4d -%endif lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -375,7 +369,7 @@ cglobal hpel_filter_v, 5,6,%1 %endmacro ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- INIT_MMX cglobal hpel_filter_c_mmx2, 3,3 @@ -405,7 +399,7 @@ cglobal hpel_filter_c_mmx2, 3,3 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h_mmx2, 3,3 add r0, r2 @@ -452,7 +446,7 @@ INIT_XMM %macro HPEL_C 0 ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,9 add r0, r2 @@ -520,7 +514,7 @@ cglobal hpel_filter_c, 3,3,9 %endmacro ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h_sse2, 3,3,8 add r0, r2 @@ -568,7 +562,7 @@ cglobal hpel_filter_h_sse2, 3,3,8 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_H 0 cglobal hpel_filter_h, 3,3 @@ -739,15 +733,11 @@ HPEL_H %macro HPEL 0 ;----------------------------------------------------------------------------- ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; uint8_t *src, int stride, int width, int height) +; uint8_t *src, intptr_t stride, int width, int height ) ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 -%if WIN64 - movsxd r4, r4d - movsxd r5, r5d -%endif mov r7, r3 - sub r5, 16 + sub r5d, 16 mov r8, r1 and r7, 15 sub r3, r7 @@ -815,21 +805,20 @@ HPEL %endif ; !HIGH_BIT_DEPTH 
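; [Editor's aside - illustrative, not part of the patch.] The
; "sub r5, 16" -> "sub r5d, 16" change in the hpel_filter prologue above
; shows the complementary x86_64 rule this cleanup leans on: writing any
; 32-bit register zero-extends the result into the full 64-bit register.
; A 32-bit ALU op is therefore a cheap way to make the upper half
; well-defined locally, instead of trusting the caller to have done it
; (hypothetical operands, assuming width >= 16):
;
;   mov  r5d, dword r5m  ; 32-bit load: upper half of r5 is cleared
;   sub  r5d, 16         ; still 32-bit: r5 remains fully defined
;   lea  r0, [r0+r5]     ; 64-bit addressing on r5 is now safe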
;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, int i_dst, -; pixel *src, int i_src, int w, int h) +; void plane_copy_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>w INIT_MMX cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1d, r3d, r4d - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d + FIX_STRIDES r1, r3, r4d +%if HIGH_BIT_DEPTH == 0 movsxdifnidn r4, r4d +%endif sub r1, r4 sub r3, r4 .loopy: - mov r6d, r4d - sub r6d, 63 + lea r6d, [r4-63] .loopx: prefetchnta [r2+256] movq m0, [r2 ] @@ -958,22 +947,19 @@ cglobal plane_copy_core_mmx2, 6,7 %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_interleave_core( uint8_t *dst, int i_dst, -; uint8_t *srcu, int i_srcu, -; uint8_t *srcv, int i_srcv, int w, int h ) +; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, +; uint8_t *srcu, intptr_t i_srcu, +; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w -cglobal plane_copy_interleave_core, 7,9 - FIX_STRIDES r1d, r3d, r5d, r6d +cglobal plane_copy_interleave_core, 6,9 + mov r6d, r6m %if HIGH_BIT_DEPTH - mov r1m, r1d - mov r3m, r3d - mov r6m, r6d + FIX_STRIDES r1, r3, r5, r6d + movifnidn r1mp, r1 + movifnidn r3mp, r3 + mov r6m, r6d %endif - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d - movsxdifnidn r5, r5d - movsxdifnidn r6, r6d lea r0, [r0+r6*2] add r2, r6 add r4, r6 @@ -1028,10 +1014,10 @@ cglobal plane_copy_interleave_core, 7,9 RET ;----------------------------------------------------------------------------- -; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height ) +; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 - FIX_STRIDES r1d + FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a @@ -1055,20 +1041,17 @@ cglobal store_interleave_chroma, 5,5 %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_deinterleave( pixel *dstu, int i_dstu, -; pixel *dstv, int i_dstv, -; pixel *src, int i_src, int w, int h ) +; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, +; pixel *dstv, intptr_t i_dstv, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- cglobal plane_copy_deinterleave, 6,7 DEINTERLEAVE_START mov r6d, r6m - FIX_STRIDES r1d, r3d, r5d, r6d + FIX_STRIDES r1, r3, r5, r6d %if HIGH_BIT_DEPTH mov r6m, r6d %endif - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d - movsxdifnidn r5, r5d add r0, r6 add r2, r6 lea r4, [r4+r6*2] @@ -1088,11 +1071,11 @@ cglobal plane_copy_deinterleave, 6,7 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal 
load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a @@ -1103,11 +1086,11 @@ cglobal load_deinterleave_chroma_fenc, 4,4 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a @@ -1236,7 +1219,7 @@ MEMZERO %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride ) +; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- INIT_XMM cglobal integral_init4h_sse4, 3,4 @@ -1291,7 +1274,7 @@ INTEGRAL_INIT8H %macro INTEGRAL_INIT_8V 0 ;----------------------------------------------------------------------------- -; void integral_init8v( uint16_t *sum8, int stride ) +; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 shl r1, 1 @@ -1316,7 +1299,7 @@ INIT_XMM sse2 INTEGRAL_INIT_8V ;----------------------------------------------------------------------------- -; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride ) +; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- INIT_MMX cglobal integral_init4v_mmx, 3,5 @@ -1505,17 +1488,14 @@ cglobal integral_init4v_ssse3, 3,5 ;----------------------------------------------------------------------------- ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; int src_stride, int dst_stride, int width, int height ) +; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ;----------------------------------------------------------------------------- %macro FRAME_INIT_LOWRES 0 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise %if HIGH_BIT_DEPTH shl dword r6m, 1 - FIX_STRIDES r5d + FIX_STRIDES r5 shl dword r7m, 1 -%endif -%if WIN64 - movsxd r5, r5d %endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 435f6bd9..8e587536 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -37,22 +37,22 @@ void func##_sse2 args;\ void func##_ssse3 args; -DECL_SUF( x264_pixel_avg_16x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_16x8, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x8, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x4, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x8, ( pixel *, int, pixel *, int, pixel *, int, 
int )) -DECL_SUF( x264_pixel_avg_4x4, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x2, ( pixel *, int, pixel *, int, pixel *, int, int )) +DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x2, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) #define MC_WEIGHT(w,type) \ - void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); + void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define MC_WEIGHT_OFFSET(w,type) \ - void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \ - void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \ + void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ + void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ MC_WEIGHT(w,type) MC_WEIGHT_OFFSET( 4, mmx2 ) @@ -75,74 +75,73 @@ MC_WEIGHT( 20, ssse3 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int ); -void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int ); -void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int ); -void x264_prefetch_ref_mmx2( pixel *, int, int ); -void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h); -void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h ); -void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_c( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu, - pixel *dstv, int i_dstv, - pixel *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu, - pixel *dstv, int i_dstv, - pixel *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, - uint8_t *dstv, int i_dstv, - uint8_t *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx( 
uint16_t *dstu, int i_dstu, - uint16_t *dstv, int i_dstv, - uint16_t *src, int i_src, int w, int h ); -void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); -void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); -void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); -void x264_memzero_aligned_mmx( void * dst, int n ); -void x264_memzero_aligned_sse2( void * dst, int n ); -void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride ); -void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride ); -void x264_integral_init8v_mmx( uint16_t *sum8, int stride ); -void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); -void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); +void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu, + pixel 
*dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu, + uint8_t *dstv, intptr_t i_dstv, + uint8_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint16_t *src, intptr_t i_src, int w, int h ); +void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); +void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx ( void *dst, size_t n ); +void x264_memzero_aligned_sse2( void *dst, size_t n ); +void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ -void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\ - pixel *src, int i_src,\ +void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ int dx, int dy, int i_width, int i_height ); MC_CHROMA(mmx2) 

 #define MC_CHROMA(cpu)\
-void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
-                           pixel *src, int i_src,\
+void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
                            int dx, int dy, int i_width, int i_height );
 MC_CHROMA(mmx2)
 MC_CHROMA(sse2)
@@ -154,7 +153,7 @@ MC_CHROMA(avx_cache64)

 #define LOWRES(cpu)\
 void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
-                                        int src_stride, int dst_stride, int width, int height );
+                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
 LOWRES(mmx2)
 LOWRES(cache32_mmx2)
 LOWRES(sse2)
@@ -163,7 +162,7 @@ LOWRES(avx)
 LOWRES(xop)

 #define PIXEL_AVG_W(width,cpu)\
-void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
+void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
 /* This declares some functions that don't exist, but that isn't a problem. */
 #define PIXEL_AVG_WALL(cpu)\
 PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
@@ -177,7 +176,7 @@ PIXEL_AVG_WALL(sse2_misalign)
 PIXEL_AVG_WALL(cache64_ssse3)

 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
-static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
+static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
 {\
     NULL,\
     x264_pixel_avg2_w4_##name1,\
@@ -216,7 +215,7 @@ PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3,
 #endif // HIGH_BIT_DEPTH

 #define MC_COPY_WTAB(instr, name1, name2, name3)\
-static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
+static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
 {\
     NULL,\
     x264_mc_copy_w4_##name1,\
@@ -233,7 +232,7 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
 #endif

 #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
-    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
+    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
 {\
     x264_mc_##function##_w4_##name1,\
     x264_mc_##function##_w4_##name1,\
@@ -332,10 +331,10 @@ static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 #define MC_LUMA(name,instr1,instr2)\
-static void mc_luma_##name( pixel *dst,    int i_dst_stride,\
-                            pixel *src[4], int i_src_stride,\
-                            int mvx, int mvy,\
-                            int i_width, int i_height, const x264_weight_t *weight )\
+static void mc_luma_##name( pixel *dst,    intptr_t i_dst_stride,\
+                            pixel *src[4], intptr_t i_src_stride,\
+                            int mvx, int mvy,\
+                            int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -367,10 +366,10 @@ MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
 #endif // !HIGH_BIT_DEPTH

 #define GET_REF(name)\
-static pixel *get_ref_##name( pixel *dst,    int *i_dst_stride,\
-                              pixel *src[4], int i_src_stride,\
-                              int mvx, int mvy,\
-                              int i_width, int i_height, const x264_weight_t *weight )\
+static pixel *get_ref_##name( pixel *dst,    intptr_t *i_dst_stride,\
+                              pixel *src[4], intptr_t i_src_stride,\
+                              int mvx, int mvy,\
+                              int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -410,13 +409,13 @@ GET_REF(cache64_ssse3)
 #endif // !HIGH_BIT_DEPTH

 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
-void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
-void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
-void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
+void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
+void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
+void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
 static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
-                                    int stride, int width, int height, int16_t *buf )\
+                                    intptr_t stride, int width, int height, int16_t *buf )\
 {\
-    int realign = (intptr_t)src & (align-1);\
+    intptr_t realign = (intptr_t)src & (align-1);\
     src -= realign;\
     dstv -= realign;\
     dstc -= realign;\
@@ -441,9 +440,9 @@ HPEL(16, sse2, sse2, sse2, sse2)
 #else // !HIGH_BIT_DEPTH
 HPEL(16, sse2_amd, mmx2, mmx2, sse2)
 #if ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
 #else
 HPEL(16, sse2, sse2, sse2, sse2)
 HPEL(16, ssse3, ssse3, ssse3, ssse3)
@@ -452,7 +451,7 @@ HPEL(16, avx, avx, avx, avx)
 HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
 #endif // HIGH_BIT_DEPTH

-static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
 {
     int c_w = 16/sizeof(pixel) - 1;
     if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
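The HPEL wrapper a few hunks above gets the same treatment for realign: the value is derived from a pointer, so it stays in intptr_t rather than being bounced through int. The pattern in isolation (an illustrative sketch, not patch code):

    #include <stdint.h>

    /* Round a pointer down to an alignment boundary (align a power of two). */
    static inline uint8_t *align_down( uint8_t *p, intptr_t align )
    {
        intptr_t misalign = (intptr_t)p & (align - 1);  /* 0 .. align-1 */
        return p - misalign;
    }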
@@ -470,9 +469,9 @@ static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src,
 }

 #define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( pixel *dst,  int i_dst,\
-                                              pixel *srcu, int i_srcu,\
-                                              pixel *srcv, int i_srcv, int w, int h )\
+static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
+                                              pixel *srcu, intptr_t i_srcu,\
+                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
 {\
     if( !(w&15) ) {\
         x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm
index b3dcdbab..77a87421 100644
--- a/common/x86/pixel-32.asm
+++ b/common/x86/pixel-32.asm
@@ -67,7 +67,7 @@ INIT_MMX mmx2
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
     push   r0
@@ -362,8 +362,8 @@ cglobal intra_sa8d_x3_8x8, 2,3

 ;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-;                             const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
 cglobal pixel_ssim_4x4x2_core, 0,5
     mov       r1, r1m
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6f3076cf..06737ab1 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -144,7 +144,7 @@ cextern hsub_mul

 %if HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
-; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
+; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SSD_ONE 2
 cglobal pixel_ssd_%1x%2, 4,5,6
@@ -361,7 +361,7 @@ SSD_ONE    16, 16
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SSD 2
 %if %1 != %2
@@ -466,7 +466,7 @@ SSD  8,  4
 %endif ; !HIGH_BIT_DEPTH

 ;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
 ;
 ; The maximum width this function can handle without risk of overflow is given
@@ -560,7 +560,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7

 %if HIGH_BIT_DEPTH == 0
 ;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
 ;
 ; This implementation can potentially overflow on image widths >= 11008 (or
@@ -697,7 +697,7 @@ SSD_NV12
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_var_wxh( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal pixel_var_16x16, 2,3
@@ -820,7 +820,7 @@ VAR
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
+; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
 ;-----------------------------------------------------------------------------
 %macro VAR2_8x8_MMX 2
 cglobal pixel_var2_8x%1, 5,6
@@ -1128,7 +1128,7 @@ VAR2_8x8_SSSE3 16, 7
 ; for small blocks on x86_32, modify pixel pointer instead.

 ;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal pixel_satd_16x4_internal
@@ -1335,7 +1335,7 @@ cglobal pixel_satd_4x4, 4,6
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SATDS_SSE2 0
 %if cpuflag(ssse3)
@@ -1476,7 +1476,7 @@ cglobal pixel_satd_8x4, 4,6,8

 %if ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
     lea  r6, [r0+4*r1]
@@ -3841,8 +3841,8 @@ HADAMARD_AC_SSE2
 ;=============================================================================

 ;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-;                             const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
 %macro SSIM_ITER 1
 %if HIGH_BIT_DEPTH
@@ -4006,8 +4006,10 @@ SSIM
 ;=============================================================================

 %macro ADS_START 0
-%if WIN64
+%if UNIX64
     movsxd  r5, r5d
+%else
+    mov     r5d, r5m
 %endif
     mov     r0d, r5d
     lea     r6, [r4+r5+15]
@@ -4030,7 +4032,7 @@ SSIM
 ;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
-cglobal pixel_ads4, 6,7
+cglobal pixel_ads4, 5,7
     movq    mm6, [r0]
     movq    mm4, [r0+8]
     pshufw  mm7, mm6, 0
@@ -4061,7 +4063,7 @@ cglobal pixel_ads4, 6,7
     movd    [r6], mm1
     ADS_END 1

-cglobal pixel_ads2, 6,7
+cglobal pixel_ads2, 5,7
     movq    mm6, [r0]
     pshufw  mm5, r6m, 0
     pshufw  mm7, mm6, 0
@@ -4082,7 +4084,7 @@ cglobal pixel_ads2, 6,7
     movd    [r6], mm4
     ADS_END 1

-cglobal pixel_ads1, 6,7
+cglobal pixel_ads1, 5,7
     pshufw  mm7, [r0], 0
     pshufw  mm6, r6m, 0
     ADS_START
@@ -4104,7 +4106,7 @@ cglobal pixel_ads1, 6,7
     ADS_END 2

 %macro ADS_XMM 0
-cglobal pixel_ads4, 6,7,12
+cglobal pixel_ads4, 5,7,12
     movdqa  xmm4, [r0]
     pshuflw xmm7, xmm4, 0
     pshuflw xmm6, xmm4, q2222
@@ -4168,7 +4170,7 @@ cglobal pixel_ads4, 6,7,12
 %endif ; ARCH
     ADS_END 2

-cglobal pixel_ads2, 6,7,8
+cglobal pixel_ads2, 5,7,8
     movq    xmm6, [r0]
     movd    xmm5, r6m
     pshuflw xmm7, xmm6, 0
@@ -4193,7 +4195,7 @@ cglobal pixel_ads2, 6,7,8
     movq    [r6], xmm1
     ADS_END 2

-cglobal pixel_ads1, 6,7,8
+cglobal pixel_ads1, 5,7,8
     movd    xmm7, [r0]
     movd    xmm6, r6m
     pshuflw xmm7, xmm7, 0
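ADS_START above now encodes the two safe ways of consuming a 32-bit int in 64-bit code: the UNIX64 path sign-extends the register explicitly (movsxd r5, r5d), and the other path reloads the argument's low 32 bits from its stack slot (mov r5d, r5m), which is safe because any write to a 32-bit register clears the upper half on x86_64. The same semantics expressed in C (a sketch, not patch code):

    #include <stdint.h>

    /* Explicit widening: compiles to a movsxd of the low dword. */
    int64_t widen_signed( int x )
    {
        return (int64_t)x;
    }

    /* A 32-bit register write zero-extends into the full register, which is
     * why reloading only the low half of the stack slot is also correct. */
    uint64_t widen_unsigned( uint32_t x )
    {
        return x;
    }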
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 11823f08..eeea9c70 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -39,11 +39,11 @@ ret x264_pixel_##name##_4x4_##suffix args;\

 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( pixel *, int, pixel *, int ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )

 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )

 DECL_X1( sad, mmx2 )
 DECL_X1( sad, sse2 )
@@ -84,16 +84,16 @@ DECL_X4( sad, cache64_mmx2 );
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );

-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmx2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, sse2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, xop,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmx2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse4,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, xop,   ( pixel *pix, intptr_t i_stride ))

 void x264_intra_satd_x3_4x4_mmx2   ( pixel *, pixel *, int * );
@@ -130,35 +130,35 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
 int x264_intra_sad_x9_8x8_sse4  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 int x264_intra_sad_x9_8x8_avx   ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );

-void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, int stride1,
-                                      const uint8_t *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
-                                      const pixel *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, int stride1,
-                                      const pixel *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
+                                      const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
+                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
+                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
-int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
-int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
-int x264_pixel_vsad_ssse3( pixel *src, int stride, int height );
-int x264_pixel_vsad_xop( pixel *src, int stride, int height );
+float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
+int x264_pixel_var2_8x8_mmx2  ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_sse2  ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_xop   ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_xop  ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );

 #define DECL_ADS( size, suffix ) \
     int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 744b836d..883f0018 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -759,6 +759,7 @@ OPTIMIZE_CHROMA_2x2_DC
 %macro DENOISE_DCT 0
 cglobal denoise_dct, 4,4,8
     pxor      m6, m6
+    movsxdifnidn r3, r3d
 .loop:
     mova      m2, [r0+r3*4-2*mmsize]
     mova      m3, [r0+r3*4-1*mmsize]
@@ -804,6 +805,7 @@ DENOISE_DCT
 %macro DENOISE_DCT 0
 cglobal denoise_dct, 4,4,7
     pxor      m6, m6
+    movsxdifnidn r3, r3d
 .loop:
     mova      m2, [r0+r3*2-2*mmsize]
     mova      m3, [r0+r3*2-1*mmsize]
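denoise_dct receives its coefficient count as an int in r3 and immediately uses it in address arithmetic ([r0+r3*4]), so the added movsxdifnidn sign-extends the low dword before any 64-bit indexing. The failure mode it guards against, modeled in C with hypothetical helpers (not patch code):

    #include <stdint.h>

    /* raw_reg stands in for the full 64-bit argument register; its high
     * half is unspecified when the caller only passed a 32-bit int. */
    uint32_t load_buggy( uint32_t *buf, uint64_t raw_reg )
    {
        return buf[raw_reg];                    /* junk high bits shift the load */
    }

    uint32_t load_fixed( uint32_t *buf, uint64_t raw_reg )
    {
        return buf[(int64_t)(int32_t)raw_reg];  /* what movsxd establishes */
    }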
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 58f4273e..50ad2d72 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -80,7 +80,7 @@ cextern sw_64
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD 2
 cglobal pixel_sad_%1x%2_mmx2, 4,4
@@ -116,7 +116,7 @@ SAD  4,  4

 %macro SAD_W16 0
 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x16, 4,4,8
     movu    m0, [r2]
@@ -183,7 +183,7 @@ cglobal pixel_sad_16x16, 4,4,8
     SAD_END_SSE2

 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x8, 4,4
     movu    m0, [r2]
@@ -257,7 +257,7 @@ cglobal pixel_sad_8x16_sse2, 4,4
     RET

 ;-----------------------------------------------------------------------------
-; void pixel_vsad( pixel *src, int stride );
+; void pixel_vsad( pixel *src, intptr_t stride );
 ;-----------------------------------------------------------------------------

 %if ARCH_X86_64 == 0
@@ -867,14 +867,10 @@ INTRA_SAD16

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, int i_stride, int scores[3] )
+;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
-%if WIN64
-    %assign i %1+1
-    movsxd r %+ i, r %+ i %+ d
-%endif
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -1190,14 +1186,10 @@ SAD_X 4,  4,  4

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, int i_stride, int scores[3] )
+;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X_SSE2 3
 cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
-%if WIN64
-    %assign i %1+1
-    movsxd r %+ i, r %+ i %+ d
-%endif
     SAD_X%1_2x%2P_SSE2 1
 %rep %3/2-1
     SAD_X%1_2x%2P_SSE2 0
@@ -1485,9 +1477,6 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
 %if ARCH_X86_64
     PROLOGUE 6,9
     mov  r8, r6mp
-%if WIN64
-    movsxd r5, r5d
-%endif
     push r4
     push r3
     push r2
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 39f72598..273d0a01 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -87,7 +87,7 @@ cextern pw_8
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD_MMX 3
 cglobal pixel_sad_%1x%2, 4,4
@@ -152,7 +152,7 @@ SAD_MMX  4,  4, 2
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD_XMM 2
 cglobal pixel_sad_%1x%2, 4,4,8
@@ -402,15 +402,12 @@ PIXEL_VSAD

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
-;                        uint16_t *pix2, int i_stride, int scores[3] )
+;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
     %assign regnum %1+1
     %xdefine STRIDE r %+ regnum
-%if WIN64
-    movsxd STRIDE, STRIDE %+ d
-%endif
     mov     r6, %3/2-1
     SAD_X%1_ONE_START
     SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index f9be2cf0..4b2229ec 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -52,11 +52,6 @@
     %define mangle(x) x
 %endif

-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
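The FIXME removed above described the root cause: both 64-bit calling conventions define only the low 32 bits of a register that holds an int argument, and while GCC happens to zero the rest, Clang is free not to. The distinction in C terms (an illustrative sketch, not patch code):

    #include <stdint.h>

    uint8_t read_row( uint8_t *pix, int stride, int y )
    {
        /* Correct 64-bit code widens explicitly; this indexing compiles to a
         * movsxd of stride before the multiply. The broken asm instead used
         * the full register (e.g. an imul on the 64-bit register), picking up
         * whatever the caller happened to leave in the high dword. */
        return pix[(intptr_t)stride * y];
    }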
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 51691c93..e4778771 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1987,7 +1987,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
     ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
     pixel *src0, *src1;
-    int stride0 = 16, stride1 = 16;
+    intptr_t stride0 = 16, stride1 = 16;
     int i_ref, i_mvc;
     ALIGNED_4( int16_t mvc[9][2] );
     int try_skip = a->b_try_skip;
@@ -2304,7 +2304,7 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
         int y8 = i>>1;
         int i_part_cost;
         int i_part_cost_bi;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_8x8;
@@ -2393,7 +2393,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         int y8 = i>>1;
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];

         for( int l = 0; l < 2; l++ )
@@ -2464,7 +2464,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i
     {
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {16,16};
+        intptr_t stride[2] = {16,16};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_16x8;
@@ -2558,7 +2558,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i
     {
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_8x16;
diff --git a/encoder/me.c b/encoder/me.c
index ccc7ad40..7b11e01d 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -70,7 +70,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite

 #define COST_MV_HPEL( mx, my ) \
 { \
-    int stride2 = 16; \
+    intptr_t stride2 = 16; \
     pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -775,7 +775,7 @@ void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh

 #define COST_MV_SAD( mx, my ) \
 { \
-    int stride = 16; \
+    intptr_t stride = 16; \
     pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -785,7 +785,7 @@ void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh
 #define COST_MV_SATD( mx, my, dir ) \
 if( b_refine_qpel || (dir^1) != odir ) \
 { \
-    int stride = 16; \
+    intptr_t stride = 16; \
     pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -854,7 +854,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         {
             int omx = bmx, omy = bmy;
             int costs[4];
-            int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
+            intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
             pixel *src0, *src1, *src2, *src3;
             src0 = h->mc.get_ref( pix,    &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
             src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
@@ -988,7 +988,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     int ref1 = h->mb.cache.ref[1][s8];
     const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
     const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    int stride[3][2][9];
+    intptr_t stride[3][2][9];
     int bm0x = m0->mv[0];
     int bm0y = m0->mv[1];
     int bm1x = m1->mv[0];
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index c693b3f4..65ea761c 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -505,7 +505,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         } \
         else \
         { \
-            int stride1 = 16, stride2 = 16; \
+            intptr_t stride1 = 16, stride2 = 16; \
             pixel *src1, *src2; \
             src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
                                   (mv0)[0], (mv0)[1], 8, 8, w ); \
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index a0b85fac..47a4f65e 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -4,6 +4,7 @@
 ;* Copyright (C) 2008-2012 x264 project
 ;*
 ;* Authors: Loren Merritt
+;*          Henrik Gramner
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -29,7 +30,7 @@ SECTION_RODATA

 error_message: db "failed to preserve register", 0

-%if WIN64
+%if ARCH_X86_64
 ; just random numbers to reduce the chance of incidental match
 ALIGN 16
 x6:  ddq 0x79445c159ce790641a1b2550a612b48c
@@ -60,64 +61,107 @@ cextern_naked puts

 ; (max_args % 4) must equal 3 for stack alignment
 %define max_args 15

+%if ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; void x264_checkasm_stack_clobber( uint64_t clobber, ... )
+;-----------------------------------------------------------------------------
+cglobal checkasm_stack_clobber, 1,2
+    ; Clobber the stack with junk below the stack pointer
+    %define size (max_args+6)*8
+    SUB  rsp, size
+    mov   r1, size-8
+.loop:
+    mov [rsp+r1], r0
+    sub   r1, 8
+    jge .loop
+    ADD  rsp, size
+    RET
+
 %if WIN64
+    %assign free_regs 7
+%else
+    %assign free_regs 9
+%endif
+
 ;-----------------------------------------------------------------------------
 ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checkasm_call, 4,15,16
-    SUB  rsp, max_args*8
+cglobal checkasm_call, 2,15,16
+    SUB  rsp, max_args*8+16
     mov  r6, r0
-    mov  [rsp+stack_offset+16], r1
-    mov  r0, r2
-    mov  r1, r3
-    mov  r2d, r4m ; FIXME truncates pointer
-    mov  r3d, r5m ; FIXME truncates pointer
-%assign i 4
-%rep max_args-4
-    mov  r4, [rsp+stack_offset+8+(i+2)*8]
-    mov  [rsp+i*8], r4
-    %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
-    mova m %+ i, [x %+ i]
-    %assign i i+1
-%endrep
-%assign i 7
-%rep 15-7
+    mov  [rsp+max_args*8], r1
+
+    ; All arguments have been pushed on the stack instead of registers in order to
+    ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
+    mov  r0, r6mp
+    mov  r1, r7mp
+    mov  r2, r8mp
+    mov  r3, r9mp
+%if UNIX64
+    mov  r4, r10mp
+    mov  r5, r11mp
+    %assign i 6
+    %rep max_args-6
+        mov  r9, [rsp+stack_offset+(i+1)*8]
+        mov  [rsp+(i-6)*8], r9
+        %assign i i+1
+    %endrep
+%else
+    %assign i 4
+    %rep max_args-4
+        mov  r9, [rsp+stack_offset+(i+7)*8]
+        mov  [rsp+i*8], r9
+        %assign i i+1
+    %endrep
+%endif
+
+%if WIN64
+    %assign i 6
+    %rep 16-6
+        mova m %+ i, [x %+ i]
+        %assign i i+1
+    %endrep
+%endif
+
+%assign i 14
+%rep 15-free_regs
     mov  r %+ i, [n %+ i]
-    %assign i i+1
+    %assign i i-1
 %endrep
 call r6
-%assign i 7
-%rep 15-7
+%assign i 14
+%rep 15-free_regs
     xor  r %+ i, [n %+ i]
-    or   r7, r %+ i
-    %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
-    pxor m %+ i, [x %+ i]
-    por  m6, m %+ i
-    %assign i i+1
+    or   r14, r %+ i
+    %assign i i-1
 %endrep
+
+%if WIN64
+    %assign i 6
+    %rep 16-6
+        pxor m %+ i, [x %+ i]
+        por  m6, m %+ i
+        %assign i i+1
+    %endrep
 packsswb m6, m6
 movq r5, m6
-    or   r7, r5
+    or   r14, r5
+%endif
+
     jz .ok
-    mov  r4, rax
+    mov  r9, rax
     lea  r0, [error_message]
     call puts
-    mov  r1, [rsp+stack_offset+16]
+    mov  r1, [rsp+max_args*8]
     mov  dword [r1], 0
-    mov  rax, r4
+    mov  rax, r9
 .ok:
-    ADD  rsp, max_args*8
+    ADD  rsp, max_args*8+16
     RET

-%elif ARCH_X86_64 == 0
+%else

 ; just random numbers to reduce the chance of incidental match
 %define n3 dword 0x6549315c
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 144d28fa..b9b6b8ae 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -196,17 +196,33 @@ static void print_bench(void)

 #if ARCH_X86 || ARCH_X86_64
 int x264_stack_pagealign( int (*func)(), int align );
+
+/* detect when callee-saved regs aren't saved.
+ * needs an explicit asm check because it only sometimes crashes in normal use. */
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 #else
 #define x264_stack_pagealign( func, align ) func()
 #endif

 #define call_c1(func,...) func(__VA_ARGS__)

-#if ARCH_X86 || defined(_WIN64)
-/* detect when callee-saved regs aren't saved.
- * needs an explicit asm check because it only sometimes crashes in normal use. */
-intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
-#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__)
+#if ARCH_X86_64
+/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
+ * This is done by clobbering the stack with junk around the stack pointer and calling the
+ * assembly function through x264_checkasm_call with added dummy arguments which forces all
+ * real arguments to be passed on the stack and not in registers. For 32-bit arguments the
+ * upper half of the 64-bit register location on the stack will now contain junk. Note that
+ * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
+ * overwrite the junk written to the stack, so there's no guarantee that it will always
+ * detect all functions that assume zero-extension.
+ */
+void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#define call_a1(func,...) ({ \
+    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
+#elif ARCH_X86
+#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 #else
 #define call_a1 call_c1
 #endif
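The junk constant in call_a1 replicates one random 16-bit pattern into all four words of a 64-bit value, which keeps accidental matches with legitimate data unlikely while staying cheap to generate on every call. A standalone model of the construction (hypothetical name, not test code from this patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Same construction as call_a1 above. */
    static uint64_t make_stack_junk( void )
    {
        return (rand() & 0xffff) * 0x0001000100010001ULL;
    }

Any function under test that reads a full 64-bit stack slot where the wrapper only stored a 32-bit int will compute with those junk words and fail the comparison against the C reference.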
@@ -291,8 +307,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int j = 0; j < 64; j++ ) \
         { \
-            res_c   = call_c( pixel_c.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
-            res_asm = call_a( pixel_asm.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
+            res_c   = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+            res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
             if( res_c != res_asm ) \
             { \
                 ok = 0; \
@@ -332,16 +348,16 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 64; j++ ) \
         { \
             pixel *pix2 = pbuf2+j; \
-            res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
+            res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2,   64 ); \
             res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
             res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
             if( N == 4 ) \
             { \
                 res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
-                call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+                call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
             } \
             else \
-                call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+                call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
             if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
             { \
                 ok = 0; \
@@ -350,9 +366,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
                          res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
             } \
             if( N == 4 ) \
-                call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+                call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
             else \
-                call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+                call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
         } \
     } \
 } \
@@ -367,8 +383,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
         /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
-        call_c1( pixel_c.var[i], pbuf1, 16 ); \
-        call_a1( pixel_asm.var[i], pbuf1, 16 ); \
+        call_c1( pixel_c.var[i], pbuf1, 16 ); \
+        call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
         uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
         uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
         if( res_c != res_asm ) \
@@ -376,8 +392,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             ok = 0; \
             fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
         } \
-        call_c2( pixel_c.var[i], pbuf1, 16 ); \
-        call_a2( pixel_asm.var[i], pbuf1, 16 ); \
+        call_c2( pixel_c.var[i], pbuf1, (intptr_t)16 ); \
+        call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
     }

     ok = 1; used_asm = 0;
@@ -392,8 +408,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             int res_c, res_asm, ssd_c, ssd_asm; \
             set_func_name( "%s_%s", "var2", pixel_names[i] ); \
             used_asm = 1; \
-            res_c   = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
-            res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
+            res_c   = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
+            res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
             if( res_c != res_asm || ssd_c != ssd_asm ) \
             { \
                 ok = 0; \
@@ -415,8 +431,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 32; j++ )
         {
             pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
-            call_c1( pixel_c.hadamard_ac[i], pbuf1, 16 );
-            call_a1( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+            call_c1( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+            call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
             uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
             uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
             if( rc != ra )
@@ -426,8 +442,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
                 break;
             }
         }
-        call_c2( pixel_c.hadamard_ac[i], pbuf1, 16 );
-        call_a2( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+        call_c2( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+        call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
     }
     report( "pixel hadamard_ac :" );

@@ -446,8 +462,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 2 && ok; j++ )
         {
             pixel *p = j ? pbuf4 : pbuf1;
-            res_c   = call_c( pixel_c.vsad, p, 16, h );
-            res_asm = call_a( pixel_asm.vsad, p, 16, h );
+            res_c   = call_c( pixel_c.vsad, p, (intptr_t)16, h );
+            res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
             if( res_c != res_asm )
             {
                 ok = 0;
@@ -627,8 +643,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
                      res_u_c, res_v_c, res_u_a, res_v_a );
         }
-        call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
-        call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
+        call_c( pixel_c.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
+        call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
     }
     report( "ssd_nv12 :" );

@@ -648,8 +664,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
         }
         set_func_name( "ssim_core" );
-        call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
-        call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
+        call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+        call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
         set_func_name( "ssim_end" );
         call_c2( pixel_c.ssim_end4, sums, sums, 4 );
         call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
@@ -1054,8 +1070,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
-        call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
+        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+        call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
         if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
         { \
             fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
@@ -1065,15 +1081,15 @@ static int check_mc( int cpu_ref, int cpu_new )
     if( mc_a.get_ref != mc_ref.get_ref ) \
     { \
         pixel *ref = dst2; \
-        int ref_stride = 32; \
+        intptr_t ref_stride = 32; \
         int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
         const x264_weight_t *weight = x264_weight_none; \
         set_func_name( "get_ref_%dx%d", w_checked, h ); \
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
-        ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
+        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+        ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
         for( int i = 0; i < h; i++ ) \
             if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
             { \
@@ -1090,14 +1106,14 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \
-        call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \
+        call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
+        call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
         /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
         for( int j = 0; j < h; j++ ) \
             for( int i = w; i < 8; i++ ) \
             { \
                 dst2[i+j*16+8] = dst1[i+j*16+8]; \
-                dst2[i+j*16] = dst1[i+j*16]; \
+                dst2[i+j*16  ] = dst1[i+j*16  ]; \
             } \
         if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
         { \
@@ -1149,15 +1165,15 @@ static int check_mc( int cpu_ref, int cpu_new )
             { \
                 set_func_name( "%s_%s", #name, pixel_names[i] ); \
                 used_asm = 1; \
-                call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-                call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+                call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+                call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
                 if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
                 { \
                     ok = 0; \
                     fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
                 } \
-                call_c2( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-                call_a2( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+                call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+                call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
             } \
         } \
     }
@@ -1185,9 +1201,9 @@ static int check_mc( int cpu_ref, int cpu_new )
             { \
                 set_func_name( "%s_w%d", #name, j ); \
                 used_asm = 1; \
-                call_c1( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_c1( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
                 mc_a.weight_cache(&ha, &weight); \
-                call_a1( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
                 for( int k = 0; k < 16; k++ ) \
                     if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
                     { \
                         ok = 0; \
                         fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
                         break; \
                     } \
-                call_c2( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
-                call_a2( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+                call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
             } \
         }
@@ -1248,8 +1264,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1;
         memset( pbuf3, 0, 64*height );
         memset( pbuf4, 0, 64*height );
-        call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
-        call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
+        call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
+        call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
         if( memcmp( pbuf3, pbuf4, 64*height ) )
         {
             ok = 0;
@@ -1261,8 +1277,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
        set_func_name( "load_deinterleave_chroma_fenc" );
         used_asm = 1;
-        call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
-        call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
+        call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
+        call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
         if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
         {
             ok = 0;
@@ -1274,8 +1290,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "load_deinterleave_chroma_fdec" );
         used_asm = 1;
-        call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
-        call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
+        call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
+        call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
         if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
         {
             ok = 0;
@@ -1298,8 +1314,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = plane_specs[i].w;
         int h = plane_specs[i].h;
-        int src_stride = plane_specs[i].src_stride;
-        int dst_stride = (w + 127) & ~63;
+        intptr_t src_stride = plane_specs[i].src_stride;
+        intptr_t dst_stride = (w + 127) & ~63;
         assert( dst_stride * h <= 0x1000 );
         pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
         memset( pbuf3, 0, 0x1000*sizeof(pixel) );
@@ -1310,7 +1326,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1324,8 +1340,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = (plane_specs[i].w + 1) >> 1;
         int h = plane_specs[i].h;
-        int src_stride = (plane_specs[i].src_stride + 1) >> 1;
-        int dst_stride = (2*w + 127) & ~63;
+        intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
+        intptr_t dst_stride = (2*w + 127) & ~63;
         assert( dst_stride * h <= 0x1000 );
         pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
         memset( pbuf3, 0, 0x1000*sizeof(pixel) );
@@ -1336,7 +1352,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1350,9 +1366,9 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = (plane_specs[i].w + 1) >> 1;
         int h = plane_specs[i].h;
-        int dst_stride = w;
-        int src_stride = (2*w + 127) & ~63;
-        int offv = (dst_stride*h + 31) & ~15;
+        intptr_t dst_stride = w;
+        intptr_t src_stride = (2*w + 127) & ~63;
+        intptr_t offv = (dst_stride*h + 31) & ~15;
         memset( pbuf3, 0, 0x1000 );
         memset( pbuf4, 0, 0x1000 );
         call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
@@ -1362,7 +1378,7 @@ static int check_mc( int cpu_ref, int cpu_new )
                 memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1379,8 +1395,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         ok = 1; used_asm = 1;
         memset( pbuf3, 0, 4096 * sizeof(pixel) );
         memset( pbuf4, 0, 4096 * sizeof(pixel) );
-        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
-        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
+        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
+        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
         for( int i = 0; i < 3; i++ )
             for( int j = 0; j < 10; j++ )
                 //FIXME ideally the first pixels would match too, but they aren't actually used
@@ -1407,9 +1423,9 @@ static int check_mc( int cpu_ref, int cpu_new )
         ok = 1; used_asm = 1;
         for( int w = 40; w <= 48; w += 8 )
         {
-            int stride = (w+8)&~15;
-            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
-            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+            intptr_t stride = (w+8)&~15;
+            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 );
+            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 );
             for( int i = 0; i < 16; i++ )
             {
                 for( int j = 0; j < 4; j++ )
@@ -1433,7 +1449,7 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define INTEGRAL_INIT( name, size, ... )\
     if( mc_a.name != mc_ref.name )\
     {\
-        int stride = 80;\
+        intptr_t stride = 80;\
         set_func_name( #name );\
         used_asm = 1;\
         memcpy( buf3, buf1, size*2*stride );\
@@ -1496,7 +1512,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "memcpy_aligned" );
         ok = 1; used_asm = 1;
-        for( int size = 16; size < 256; size += 16 )
+        for( size_t size = 16; size < 256; size += 16 )
         {
             memset( buf4, 0xAA, size + 1 );
             call_c( mc_c.memcpy_aligned, buf3, buf1, size );
@@ -1504,7 +1520,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
             {
                 ok = 0;
-                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
                 break;
             }
         }
@@ -1515,7 +1531,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "memzero_aligned" );
         ok = 1; used_asm = 1;
-        for( int size = 128; size < 1024; size += 128 )
+        for( size_t size = 128; size < 1024; size += 128 )
         {
             memset( buf4, 0xAA, size + 1 );
             call_c( mc_c.memzero_aligned, buf3, size );
@@ -1523,7 +1539,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
             {
                 ok = 0;
-                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
                 break;
             }
         }
@@ -1561,7 +1577,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
 #define TEST_DEBLOCK( name, align, ... ) \
     for( int i = 0; i < 36; i++ ) \
     { \
-        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
+        intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
            /* two distributions of random to excersize different failure modes */ \
            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
@@ -1570,16 +1586,16 @@ static int check_deblock( int cpu_ref, int cpu_new )
        { \
            set_func_name( #name ); \
            used_asm = 1; \
-            call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
                 break; \
             } \
-            call_c2( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a2( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
        } \
     }
@@ -1935,11 +1951,11 @@ static int check_quant( int cpu_ref, int cpu_new )
             memcpy( dct1, buf1, size*sizeof(dctcoef) );
             memcpy( dct2, buf1, size*sizeof(dctcoef) );
             memcpy( buf3+256, buf3, 256 );
-            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
             call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
             if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                 ok = 0;
-            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
             call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
         }
     }