Some x264 asm assumed that the high 32 bits of registers containing "int" values would be zero.
This is almost always the case, and it seems to work with gcc, but it is *not* guaranteed by the ABI.
As a result, it breaks with some other compilers, such as Clang, that take advantage of this in their optimizations.
Accordingly, fix all x86 code by using intptr_t instead of int, or by using movsxd where necessary.
Also add a checkasm hack to detect when assembly functions incorrectly assume that 32-bit integers are zero-extended to 64 bits.
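For illustration only, here is a minimal C sketch of the detection idea behind that checkasm hack; the identifiers and constants below are invented for this sketch, and the real harness does the poisoning at the assembly level around the function under test. The idea: fill the upper 32 bits of the register that nominally carries an "int" argument with garbage, then compare against a reference that only uses the low 32 bits.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for an asm routine that was declared as taking an
 * "int" stride but reads the full 64-bit register without sign-extension. */
static int64_t row_offset_buggy( uint64_t stride_reg, int row )
{
    return (int64_t)(stride_reg * (uint64_t)row);   /* trusts the high bits */
}

/* What a correct routine (or one fixed with movsxd / an intptr_t prototype)
 * computes: only the low 32 bits of the register are meaningful. */
static int64_t row_offset_correct( uint64_t stride_reg, int row )
{
    return (int64_t)(int32_t)stride_reg * row;
}

int main( void )
{
    int stride = 720;
    /* checkasm-style poisoning: put garbage in the upper 32 bits so that any
     * callee assuming they are zero computes a visibly wrong result. */
    uint64_t poisoned = 0xa5a5a5a500000000ull | (uint32_t)stride;

    int64_t good = row_offset_correct( poisoned, 3 );
    int64_t bad  = row_offset_buggy  ( poisoned, 3 );
    printf( "correct: %lld  buggy: %lld\n", (long long)good, (long long)bad );
    return bad != good; /* nonzero exit flags the zero-extension assumption */
}

A routine fixed with movsxd or an intptr_t prototype behaves like row_offset_correct and the check passes; one that trusts the high bits behaves like row_offset_buggy and gets flagged.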
// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
// They also use nothing above armv5te, but we don't care about pre-armv6
-// void prefetch_ref( uint8_t *pix, int stride, int parity )
+// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
function x264_prefetch_ref_arm
sub r2, r2, #1
add r0, r0, #64
bx lr
.endfunc
-// void prefetch_fenc( uint8_t *pix_y, int stride_y,
-// uint8_t *pix_uv, int stride_uv, int mb_x )
+// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
+// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
function x264_prefetch_fenc_arm
ldr ip, [sp]
push {lr}
.endfunc
-// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
+// void *x264_memcpy_aligned( void *dst, const void *src, size_t n )
function x264_memcpy_aligned_neon
orr r3, r0, r1, lsr #1
movrel ip, memcpy_table
.endfunc
-// void pixel_avg( uint8_t *dst, int dst_stride,
-// uint8_t *src1, int src1_stride,
-// uint8_t *src2, int src2_stride, int weight );
+// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
+// uint8_t *src1, intptr_t src1_stride,
+// uint8_t *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon
ldr ip, [sp, #8]
.endif
.endm
-// void mc_weight( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride,
// const x264_weight_t *weight, int height )
function x264_mc_weight_w20_neon
weight_prologue full
weight_simple offsetsub, vqsub.u8
-// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
function x264_mc_copy_w4_neon
ldr ip, [sp]
copy_w4_loop:
.endfunc
-// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
-// uint8_t *src, int i_src_stride,
+// void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
+// uint8_t *src, intptr_t i_src_stride,
// int dx, int dy, int i_width, int i_height );
function x264_mc_chroma_neon
push {r4-r6, lr}
.endfunc
-// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
+// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width )
function x264_hpel_filter_v_neon
ldr ip, [sp]
sub r1, r1, r3, lsl #1
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
-// uint8_t *dstc, int src_stride, int dst_stride, int width,
+// uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width,
// int height )
function x264_frame_init_lowres_core_neon
push {r4-r10,lr}
#include "common/common.h"
#include "mc.h"
-void x264_prefetch_ref_arm( uint8_t *, int, int );
-void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
-
-void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_neon( void *dst, int n );
-
-void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
-
-void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define MC_WEIGHT(func)\
-void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
-void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
-void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
-void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
-static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
-void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
-void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
-void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
w->weightfn = x264_mc_wtab_neon;
}
-static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
NULL,
x264_pixel_avg2_w4_neon,
x264_pixel_avg2_w20_neon,
};
-static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
NULL,
x264_mc_copy_w4_neon,
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
+static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
    if ( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;
x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
}
-static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
+static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
    if ( (mvy&3) == 3 ) // explicit if() to force conditional add
src1 += i_src_stride;
}
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int stride, int width, int height, int16_t *buf )
+ intptr_t stride, int width, int height, int16_t *buf )
{
- int realign = (intptr_t)src & 15;
+ intptr_t realign = (intptr_t)src & 15;
src -= realign;
dstv -= realign;
dstc -= realign;
DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
#define DECL_X4( name, suffix ) \
- DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
- DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
+ DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
-int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
DECL_X1( sad, neon )
DECL_X1( sad_aligned, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
-int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
-uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
-uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
-int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
+uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
+int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
-uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int );
-uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int );
-uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int );
+uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
-void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
- const uint8_t *, int,
- int sums[2][4]);
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
+ const uint8_t *, intptr_t,
+ int sums[2][4] );
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
#endif
#define tc0_table(x) i_tc0_table[(x)+24]
/* From ffmpeg */
-static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 )
+static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
{
int p2 = pix[-3*xstride];
int p1 = pix[-2*xstride];
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
}
-static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
+static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
-static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
for( int d = 0; d < 8; d++, pix += stride )
deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
}
-static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
-static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
-static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc )
+static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
}
-static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
+static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
-static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
}
-static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
}
-static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
}
-static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 );
}
-static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
+static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
int p2 = pix[-3*xstride];
int p1 = pix[-2*xstride];
}
}
}
-static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
+static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < 16; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
}
-static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
+static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < 8; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, 1, alpha, beta );
}
-static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
-static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
-static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta )
+static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
}
-static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta )
+static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < height; d++, pix += ystride-2 )
for( int e = 0; e < width; e++, pix++ )
deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
}
-static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
}
-static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
}
-static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
}
-static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
}
}
}
-static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
+static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
+ int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + a;
int index_b = i_qp + b;
pf_inter( pix, i_stride, alpha, beta, tc );
}
-static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
+static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
+ int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
{
int index_a = i_qp + a;
int index_b = i_qp + b;
}
#if HAVE_MMX
-void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
-void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#if ARCH_X86
-void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
-void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#else
// FIXME this wrapper has a significant cpu cost
-static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 );
x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
}
-static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta )
+static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta )
{
x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta );
x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
#endif
#if ARCH_PPC
-void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC
#if HAVE_ARMV6
-void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
-void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
-void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
-void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
+void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif
void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
return frame;
}
-void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
- int i_width, int i_height, x264_weight_t *w )
+void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
+ int i_width, int i_height, x264_weight_t *w )
{
/* Weight horizontal strips of height 16. This was found to be the optimal height
* in terms of the cache loads. */
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
} x264_sync_frame_list_t;
-typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta );
+typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta );
typedef struct
{
x264_deblock_inter_t deblock_luma[2];
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
-void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
+void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_delete_list( x264_frame_t **list );
int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
- int i_stride0 = 16, i_stride1 = 16;
+ intptr_t i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
pixel *src0, *src1;
#endif
-static inline void pixel_avg( pixel *dst, int i_dst_stride,
- pixel *src1, int i_src1_stride,
- pixel *src2, int i_src2_stride,
- int i_width, int i_height )
+static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride,
+ pixel *src1, intptr_t i_src1_stride,
+ pixel *src2, intptr_t i_src2_stride, int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
{
}
}
-static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height )
+static inline void pixel_avg_wxh( pixel *dst, intptr_t i_dst,
+ pixel *src1, intptr_t i_src1,
+ pixel *src2, intptr_t i_src2, int width, int height )
{
for( int y = 0; y < height; y++ )
{
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height, int i_weight1 )
+static inline void pixel_avg_weight_wxh( pixel *dst, intptr_t i_dst,
+ pixel *src1, intptr_t i_src1,
+ pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 )
{
- const int i_weight2 = 64 - i_weight1;
+ int i_weight2 = 64 - i_weight1;
for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
for( int x = 0; x<width; x++ )
dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 );
#undef op_scale2
#define PIXEL_AVG_C( name, width, height ) \
-static void name( pixel *pix1, int i_stride_pix1, \
- pixel *pix2, int i_stride_pix2, \
- pixel *pix3, int i_stride_pix3, int weight ) \
+static void name( pixel *pix1, intptr_t i_stride_pix1, \
+ pixel *pix2, intptr_t i_stride_pix2, \
+ pixel *pix3, intptr_t i_stride_pix3, int weight ) \
{ \
if( weight == 32 ) \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
}
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
-static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
+ const x264_weight_t *weight, int i_width, int i_height )
{
int offset = weight->i_offset << (BIT_DEPTH-8);
int scale = weight->i_scale;
}
#define MC_WEIGHT_C( name, width ) \
- static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
+ static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \
{ \
mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
}
mc_weight_w20,
};
const x264_weight_t x264_weight_none[3] = { {{0}} };
-static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, int i_width, int i_height )
+static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
{
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
- int stride, int width, int height, int16_t *buf )
+ intptr_t stride, int width, int height, int16_t *buf )
{
const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0;
for( int y = 0; y < height; y++ )
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-static void mc_luma( pixel *dst, int i_dst_stride,
- pixel *src[4], int i_src_stride,
+static void mc_luma( pixel *dst, intptr_t i_dst_stride,
+ pixel *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}
-static pixel *get_ref( pixel *dst, int *i_dst_stride,
- pixel *src[4], int i_src_stride,
+static pixel *get_ref( pixel *dst, intptr_t *i_dst_stride,
+ pixel *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
}
/* full chroma mc (ie until 1/8 pixel)*/
-static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride,
- pixel *src, int i_src_stride,
+static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride,
+ pixel *src, intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
}
#define MC_COPY(W) \
-static void mc_copy_w##W( pixel *dst, int i_dst, pixel *src, int i_src, int i_height ) \
+static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \
{ \
mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 8 )
MC_COPY( 4 )
-void x264_plane_copy_c( pixel *dst, int i_dst,
- pixel *src, int i_src, int w, int h )
+void x264_plane_copy_c( pixel *dst, intptr_t i_dst,
+ pixel *src, intptr_t i_src, int w, int h )
{
while( h-- )
{
}
}
-void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h )
+void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h )
{
for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
for( int x=0; x<w; x++ )
}
}
-static void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
+ pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int x=0; x<w; x++ )
}
}
-static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, int i_dsta,
- pixel *dstb, int i_dstb,
- pixel *dstc, int i_dstc,
- pixel *src, int i_src, int pw, int w, int h )
+static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
+ pixel *dstc, intptr_t i_dstc,
+ pixel *src, intptr_t i_src, int pw, int w, int h )
{
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, dstc+=i_dstc, src+=i_src )
{
}
}
-static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height )
+static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
for( int x=0; x<8; x++ )
}
}
-static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
+static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
{
x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
}
-static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
+static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
{
x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
}
-static void prefetch_fenc_null( pixel *pix_y, int stride_y,
- pixel *pix_uv, int stride_uv, int mb_x )
+static void prefetch_fenc_null( pixel *pix_y, intptr_t stride_y,
+ pixel *pix_uv, intptr_t stride_uv, int mb_x )
{}
-static void prefetch_ref_null( pixel *pix, int stride, int parity )
+static void prefetch_ref_null( pixel *pix, intptr_t stride, int parity )
{}
-static void memzero_aligned( void * dst, int n )
+static void memzero_aligned( void * dst, size_t n )
{
memset( dst, 0, n );
}
-static void integral_init4h( uint16_t *sum, pixel *pix, int stride )
+static void integral_init4h( uint16_t *sum, pixel *pix, intptr_t stride )
{
int v = pix[0]+pix[1]+pix[2]+pix[3];
for( int x = 0; x < stride-4; x++ )
}
}
-static void integral_init8h( uint16_t *sum, pixel *pix, int stride )
+static void integral_init8h( uint16_t *sum, pixel *pix, intptr_t stride )
{
int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
for( int x = 0; x < stride-8; x++ )
}
}
-static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+static void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
{
for( int x = 0; x < stride-8; x++ )
sum4[x] = sum8[x+4*stride] - sum8[x];
sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4];
}
-static void integral_init8v( uint16_t *sum8, int stride )
+static void integral_init8v( uint16_t *sum8, intptr_t stride )
{
for( int x = 0; x < stride-8; x++ )
sum8[x] = sum8[x+8*stride] - sum8[x];
}
static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
- int src_stride, int dst_stride, int width, int height )
+ intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
for( int y = 0; y < height; y++ )
{
#define X264_MC_H
struct x264_weight_t;
-typedef void (* weight_fn_t)( pixel *, int, pixel *,int, const struct x264_weight_t *, int );
+typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int );
typedef struct x264_weight_t
{
/* aligning the first member is a gcc hack to force the struct to be
typedef struct
{
- void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src,
+ void (*mc_luma)( pixel *dst, intptr_t i_dst, pixel **src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
- pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src,
+ pixel* (*get_ref)( pixel *dst, intptr_t *i_dst, pixel **src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
- void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
+ void (*mc_chroma)( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,
int mvx, int mvy, int i_width, int i_height );
- void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
+ void (*avg[12])( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+ pixel *src2, intptr_t src2_stride, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
- void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
- void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );
-
- void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
- void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height );
- void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height );
-
- void (*plane_copy)( pixel *dst, int i_dst,
- pixel *src, int i_src, int w, int h );
- void (*plane_copy_interleave)( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
+ void (*copy[7])( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
+ void (*copy_16x16_unaligned)( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int i_height );
+
+ void (*store_interleave_chroma)( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+ void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, intptr_t i_src, int height );
+ void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+ void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h );
+ void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
/* may write up to 15 pixels off the end of each plane */
- void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h );
- void (*plane_copy_deinterleave_rgb)( pixel *dsta, int i_dsta,
- pixel *dstb, int i_dstb,
- pixel *dstc, int i_dstc,
- pixel *src, int i_src, int pw, int w, int h );
+ void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h );
+ void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
+ pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
- int i_stride, int i_width, int i_height, int16_t *buf );
+ intptr_t i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
- void (*prefetch_fenc)( pixel *pix_y, int stride_y,
- pixel *pix_uv, int stride_uv, int mb_x );
- void (*prefetch_fenc_420)( pixel *pix_y, int stride_y,
- pixel *pix_uv, int stride_uv, int mb_x );
- void (*prefetch_fenc_422)( pixel *pix_y, int stride_y,
- pixel *pix_uv, int stride_uv, int mb_x );
+ void (*prefetch_fenc) ( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
+ void (*prefetch_fenc_420)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
+ void (*prefetch_fenc_422)( pixel *pix_y, intptr_t stride_y, pixel *pix_uv, intptr_t stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
- void (*prefetch_ref)( pixel *pix, int stride, int parity );
+ void (*prefetch_ref)( pixel *pix, intptr_t stride, int parity );
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
- void (*memzero_aligned)( void *dst, int n );
+ void (*memzero_aligned)( void *dst, size_t n );
/* successive elimination prefilter */
- void (*integral_init4h)( uint16_t *sum, pixel *pix, int stride );
- void (*integral_init8h)( uint16_t *sum, pixel *pix, int stride );
- void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
- void (*integral_init8v)( uint16_t *sum8, int stride );
+ void (*integral_init4h)( uint16_t *sum, pixel *pix, intptr_t stride );
+ void (*integral_init8h)( uint16_t *sum, pixel *pix, intptr_t stride );
+ void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
+ void (*integral_init8v)( uint16_t *sum8, intptr_t stride );
void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
- int src_stride, int dst_stride, int width, int height );
+ intptr_t src_stride, intptr_t dst_stride, int width, int height );
weight_fn_t *weight;
weight_fn_t *offsetadd;
weight_fn_t *offsetsub;
* pixel_sad_WxH
****************************************************************************/
#define PIXEL_SAD_C( name, lx, ly ) \
-static int name( pixel *pix1, int i_stride_pix1, \
- pixel *pix2, int i_stride_pix2 ) \
+static int name( pixel *pix1, intptr_t i_stride_pix1, \
+ pixel *pix2, intptr_t i_stride_pix2 ) \
{ \
int i_sum = 0; \
for( int y = 0; y < ly; y++ ) \
* pixel_ssd_WxH
****************************************************************************/
#define PIXEL_SSD_C( name, lx, ly ) \
-static int name( pixel *pix1, int i_stride_pix1, \
- pixel *pix2, int i_stride_pix2 ) \
+static int name( pixel *pix1, intptr_t i_stride_pix1, \
+ pixel *pix2, intptr_t i_stride_pix2 ) \
{ \
int i_sum = 0; \
for( int y = 0; y < ly; y++ ) \
PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 )
-uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
+uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1,
+ pixel *pix2, intptr_t i_pix2, int i_width, int i_height )
{
uint64_t i_ssd = 0;
int y;
return i_ssd;
}
-static void pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
+static void pixel_ssd_nv12_core( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2,
+ int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
{
*ssd_u = 0, *ssd_v = 0;
for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
}
}
-void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
+void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
+ int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v )
{
pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v );
if( i_width&7 )
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, h ) \
-static uint64_t name( pixel *pix, int i_stride ) \
+static uint64_t name( pixel *pix, intptr_t i_stride ) \
{ \
uint32_t sum = 0, sqr = 0; \
for( int y = 0; y < h; y++ ) \
* pixel_var2_wxh
****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
-static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \
+static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
for( int y = 0; y < h; y++ ) \
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
-static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
+static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
sum2_t tmp[4][2];
sum2_t a0, a1, a2, a3, b0, b1;
return sum >> 1;
}
-static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
+static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
sum2_t tmp[4][4];
sum2_t a0, a1, a2, a3;
}
#define PIXEL_SATD_C( w, h, sub )\
-static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\
+static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\
{\
int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
+ sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 )
PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 )
-static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
+static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
sum2_t tmp[8][4];
sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return sum;
}
-static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
+static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
return (sum+2)>>2;
}
-static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
+static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )
{
int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
+ sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
}
-static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride )
+static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
{
sum2_t tmp[32];
sum2_t a0, a1, a2, a3, dc;
}
#define HADAMARD_AC(w,h) \
-static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\
+static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\
{\
uint64_t sum = pixel_hadamard_ac( pix, stride );\
if( w==16 )\
* pixel_sad_x4
****************************************************************************/
#define SAD_X( size ) \
-static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
+static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
+ intptr_t i_stride, int scores[3] )\
{\
scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
-static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
+static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
+ intptr_t i_stride, int scores[4] )\
{\
scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
****************************************************************************/
#define SATD_X( size, cpu ) \
-static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
+static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\
+ intptr_t i_stride, int scores[3] )\
{\
scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
-static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
+static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\
+ intptr_t i_stride, int scores[4] )\
{\
scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
/****************************************************************************
* structural similarity metric
****************************************************************************/
-static void ssim_4x4x2_core( const pixel *pix1, int stride1,
- const pixel *pix2, int stride2,
- int sums[2][4])
+static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1,
+ const pixel *pix2, intptr_t stride2,
+ int sums[2][4] )
{
for( int z = 0; z < 2; z++ )
{
}
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
- pixel *pix1, int stride1,
- pixel *pix2, int stride2,
+ pixel *pix1, intptr_t stride1,
+ pixel *pix2, intptr_t stride2,
int width, int height, void *buf, int *cnt )
{
int z = 0;
return ssim;
}
-static int pixel_vsad( pixel *src, int stride, int height )
+static int pixel_vsad( pixel *src, intptr_t stride, int height )
{
int score = 0;
for( int i = 1; i < height; i++, src += stride )
// SSD assumes all args aligned
// other cmp functions assume first arg aligned
-typedef int (*x264_pixel_cmp_t) ( pixel *, int, pixel *, int );
-typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, int, int[3] );
-typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int[4] );
+typedef int (*x264_pixel_cmp_t) ( pixel *, intptr_t, pixel *, intptr_t );
+typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] );
+typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] );
enum
{
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
- int (*vsad)( pixel *, int, int );
+ int (*vsad)( pixel *, intptr_t, int );
- uint64_t (*var[4])( pixel *pix, int stride );
- int (*var2[4])( pixel *pix1, int stride1,
- pixel *pix2, int stride2, int *ssd );
- uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
+ uint64_t (*var[4])( pixel *pix, intptr_t stride );
+ int (*var2[4])( pixel *pix1, intptr_t stride1,
+ pixel *pix2, intptr_t stride2, int *ssd );
+ uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
- void (*ssd_nv12_core)( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width, int height,
+ void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width, int height,
uint64_t *ssd_u, uint64_t *ssd_v );
- void (*ssim_4x4x2_core)( const pixel *pix1, int stride1,
- const pixel *pix2, int stride2, int sums[2][4] );
+ void (*ssim_4x4x2_core)( const pixel *pix1, intptr_t stride1,
+ const pixel *pix2, intptr_t stride2, int sums[2][4] );
float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
/* multiple parallel calls to cmp. */
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
-void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
-uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf, int *cnt );
+void x264_pixel_ssd_nv12 ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
+ int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v );
+uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
+ int i_width, int i_height );
+float x264_pixel_ssim_wxh ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2,
+ int i_width, int i_height, void *buf, int *cnt );
int x264_field_vsad( x264_t *h, int mb_x, int mb_y );
#endif
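For reference, a minimal C sketch (not part of the patch; sad_8x8_ref is a hypothetical name, with uint8_t standing in for pixel as in 8-bit builds) of a function matching the x264_pixel_cmp_t signature above. The strides are added directly to the pixel pointers on every row, which is why they are pointer-sized here:

#include <stdint.h>

/* Hypothetical reference SAD with the x264_pixel_cmp_t signature above. */
static int sad_8x8_ref( uint8_t *pix1, intptr_t i_pix1,
                        uint8_t *pix2, intptr_t i_pix2 )
{
    int sum = 0;
    for( int y = 0; y < 8; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int x = 0; x < 8; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d < 0 ? -d : d;
        }
    return sum;
}

Illustrative only; x264's actual C reference implementations live in common/pixel.c.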
q1 = newq1; \
}
-void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 )
{
}
}
-void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
register vec_u8_t line0, line1, line2, line3, line4, line5;
#include "ppccommon.h"
#if !HIGH_BIT_DEPTH
-typedef void (*pf_mc_t)( uint8_t *src, int i_src,
- uint8_t *dst, int i_dst, int i_height );
+typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
+ uint8_t *dst, intptr_t i_dst, int i_height );
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
}
-static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
+static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )
{
for( int y = 0; y < i_height; y++ )
}
}
-static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
+static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
}
}
-static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
+static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
}
}
-static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
+static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )
{
x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
/* mc_copy: plain c */
#define MC_COPY( name, a ) \
-static void name( uint8_t *dst, int i_dst, \
- uint8_t *src, int i_src, int i_height ) \
+static void name( uint8_t *dst, intptr_t i_dst, \
+ uint8_t *src, intptr_t i_src, int i_height ) \
{ \
int y; \
for( y = 0; y < i_height; y++ ) \
MC_COPY( x264_mc_copy_w4_altivec, 4 )
MC_COPY( x264_mc_copy_w8_altivec, 8 )
-static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src, int i_height )
+static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src, int i_height )
{
vec_u8_t cpyV;
PREP_LOAD;
PREP_LOAD_SRC( src );
- for( int y = 0; y < i_height; y++)
+ for( int y = 0; y < i_height; y++ )
{
VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
vec_st(cpyV, 0, dst);
}
-static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src, int i_height )
+static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst,
+ uint8_t *src, intptr_t i_src, int i_height )
{
- for( int y = 0; y < i_height; ++y)
+ for( int y = 0; y < i_height; ++y )
{
- vec_u8_t cpyV = vec_ld( 0, src);
+ vec_u8_t cpyV = vec_ld( 0, src );
vec_st(cpyV, 0, dst);
src += i_src;
}
-static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
+static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
if( qpel_idx & 5 ) /* qpel interpolation needed */
{
-static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
+static uint8_t *get_ref_altivec( uint8_t *dst, intptr_t *i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
- int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
if( qpel_idx & 5 ) /* qpel interpolation needed */
{
}
}
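As an aside on the arithmetic in the two functions above: the quarter-pel motion vector is split into a full-pel source offset plus a 4-bit sub-pel index. A small illustrative sketch (mv_split is a hypothetical helper mirroring the expressions above); for example, mvx = 5, mvy = 3 gives qpel_idx = (3<<2) + (5&3) = 13 and offset = 0*i_src_stride + 1:

#include <stdint.h>

/* Hypothetical helper: the low two bits of each MV component select the
 * sub-pel phase, the remaining bits give the full-pel plane offset. */
static void mv_split( int mvx, int mvy, intptr_t i_src_stride,
                      int *qpel_idx, intptr_t *offset )
{
    *qpel_idx = ((mvy & 3) << 2) + (mvx & 3);
    *offset   = (mvy >> 2) * i_src_stride + (mvx >> 2);
}

Keeping offset pointer-sized matches the hunks above, since the result is added straight to a pixel pointer.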
-static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
- uint8_t *src, int i_src_stride,
- int mvx, int mvy,
- int i_height )
+static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
+ uint8_t *src, intptr_t i_src_stride,
+ int mvx, int mvy, int i_height )
{
uint8_t *srcp;
int d8x = mvx&0x07;
}
}
-static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
- uint8_t *src, int i_src_stride,
- int mvx, int mvy,
- int i_height )
+static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
+ uint8_t *src, intptr_t i_src_stride,
+ int mvx, int mvy, int i_height )
{
uint8_t *srcp;
int d8x = mvx & 0x07;
}
}
-static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
- uint8_t *src, int i_src_stride,
- int mvx, int mvy,
- int i_height )
+static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
+ uint8_t *src, intptr_t i_src_stride,
+ int mvx, int mvy, int i_height )
{
uint8_t *srcp;
int d8x = mvx & 0x07;
}
}
-static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride,
- uint8_t *src, int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height )
+static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
+ uint8_t *src, intptr_t i_src_stride,
+ int mvx, int mvy, int i_width, int i_height )
{
if( i_width == 8 )
mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
}
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
- int i_stride, int i_width, int i_height, int16_t *buf )
+ intptr_t i_stride, int i_width, int i_height, int16_t *buf )
{
vec_u8_t destv;
vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
}
static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
- int src_stride, int dst_stride, int width, int height )
+ intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
int w = width >> 4;
int end = (width & 15);
}
}
-static void mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
}
}
}
-static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
}
}
}
-static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
}
}
}
-static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
}
}
}
-static void mc_weight_w20_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
**********************************************************************/
#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \
-static int name( uint8_t *pix1, int i_pix1, \
- uint8_t *pix2, int i_pix2 ) \
+static int name( uint8_t *pix1, intptr_t i_pix1, \
+ uint8_t *pix2, intptr_t i_pix2 ) \
{ \
ALIGNED_16( int sum ); \
\
/***********************************************************************
* SATD 4x4
**********************************************************************/
-static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 4x8
**********************************************************************/
-static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 8x4
**********************************************************************/
-static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 8x8
**********************************************************************/
-static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 8x16
**********************************************************************/
-static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 16x8
**********************************************************************/
-static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
/***********************************************************************
* SATD 16x16
**********************************************************************/
-static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
ALIGNED_16( int i_satd );
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
uint8_t *pix0, uint8_t *pix1,
uint8_t *pix2, uint8_t *pix3,
- int i_stride, int scores[4] )
+ intptr_t i_stride, int scores[4] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
- int i_stride, int scores[3] )
+ intptr_t i_stride, int scores[3] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
scores[2] = sum2;
}
-static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
+static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
+ uint8_t *pix3, intptr_t i_stride, int scores[4] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
- int i_stride, int scores[3] )
+ intptr_t i_stride, int scores[3] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
uint8_t *pix0, uint8_t *pix1,
uint8_t *pix2, uint8_t *pix3,
- int i_stride, int scores[4] )
+ intptr_t i_stride, int scores[4] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
- int i_stride, int scores[3] )
+ intptr_t i_stride, int scores[3] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
uint8_t *pix0, uint8_t *pix1,
uint8_t *pix2, uint8_t *pix3,
- int i_stride, int scores[4] )
+ intptr_t i_stride, int scores[4] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
uint8_t *pix1, uint8_t *pix2,
- int i_stride, int scores[3] )
+ intptr_t i_stride, int scores[3] )
{
ALIGNED_16( int sum0 );
ALIGNED_16( int sum1 );
* SSD routines
**********************************************************************/
-static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
- uint8_t *pix2, int i_stride_pix2)
+static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
+ uint8_t *pix2, intptr_t i_stride_pix2 )
{
ALIGNED_16( int sum );
return sum;
}
-static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
- uint8_t *pix2, int i_stride_pix2)
+static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
+ uint8_t *pix2, intptr_t i_stride_pix2 )
{
ALIGNED_16( int sum );
/****************************************************************************
* variance
****************************************************************************/
-static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
return sum + ((uint64_t)sqr<<32);
}
-static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride )
{
ALIGNED_16(uint32_t sum_tab[4]);
ALIGNED_16(uint32_t sqr_tab[4]);
sa8d7v = vec_sub(b6v, b7v); \
}
-static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
int32_t i_satd=0;
return i_satd;
}
-static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
int32_t i_satd;
i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;
return i_satd;
}
-static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1,
- uint8_t *pix2, int i_pix2 )
+static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
+ uint8_t *pix2, intptr_t i_pix2 )
{
int32_t i_satd;
- i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 )
- + pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 )
+ i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 )
+ + pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 )
+ pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 )
+ pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;
return i_satd;
vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \
vec_s16_t pix16_d##num;
-static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
+static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm )
{
ALIGNED_16( int32_t sum4_tab[4] );
ALIGNED_16( int32_t sum8_tab[4] );
0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
};
-static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride )
+static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride )
{
int idx = ((uintptr_t)pix & 8) >> 3;
vec_u8_t permh = hadamard_permtab[idx];
return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
-static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride )
+static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride )
{
int idx = ((uintptr_t)pix & 8) >> 3;
vec_u8_t permh = hadamard_permtab[idx];
return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
-static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride )
+static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride )
{
vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
-static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride )
+static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride )
{
vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
/****************************************************************************
* structural similarity metric
****************************************************************************/
-static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2,
+static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
+ const uint8_t *pix2, intptr_t stride2,
int sums[2][4] )
{
ALIGNED_16( int temp[4] );
}
#define SATD_X( size ) \
-static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
+ intptr_t i_stride, int scores[3] )\
{\
scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
}\
-static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
+ uint8_t *pix3, intptr_t i_stride, int scores[4] )\
{\
scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
#ifndef X264_SPARC_PIXEL_H
#define X264_SPARC_PIXEL_H
-int x264_pixel_sad_8x8_vis( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_8x16_vis( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_16x8_vis( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_16x16_vis( uint8_t *, int, uint8_t *, int );
+int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t );
#endif
%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8
%assign pad 5*mmsize+12-(stack_offset&15)
%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra, 4,7,16
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,16
%define t0 m15
%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8
LUMA_INTRA_INIT 3
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8
LUMA_INTRA_INIT 8
%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma, 5,5,10
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 5,9
- movsxd r7, r1d
- lea r8, [r7*3]
+ lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
%if WIN64
%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
- lea r6, [r6+r7*8]
- lea r5, [r5+r7*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp
+ lea r6, [r6+r1*8]
+ lea r5, [r5+r1*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
+ mov r7, r1
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%if WIN64
%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
-; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma, 5,5
lea r4, [r1*3]
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 0,5
%endif
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16
%if ARCH_X86_64 == 0
INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,9
- movsxd r7, r1d
- lea r8, [r7*3]
+ lea r8, [r1*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r7*8]
- lea r5, [r5+r7*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
- lea r0, [pix_tmp+0x40]
- mov r1, 0x10
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r1*8]
+ lea r5, [r5+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ mov r7, r1
+ lea r0, [pix_tmp+0x40]
+ mov r1, 0x10
call deblock_v_luma_intra
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma, 7,7,8
+cglobal deblock_v_chroma, 5,7,8
FIX_STRIDES r1
mov r5, r0
sub r0, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
add r1, r1
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
add r1, r1
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
CHROMA_V_START
RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
%endif
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
CHROMA_V_START
RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
%endif
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
%if WIN64
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0
- PROLOGUE 5,7,%1
- movsxd r5, dword r5m
+ PROLOGUE 6,7,%1
%endmacro
%elif UNIX64
DECLARE_REG_TMP 0,1,2,3,4,5,7,8
%endif ;HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
+; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
+;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%macro WEIGHTER 1
%endmacro
;-----------------------------------------------------------------------------
-;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
+;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 2
cglobal mc_offset%2_w%1, 6,6
;=============================================================================
;-----------------------------------------------------------------------------
-; void pixel_avg_4x4( pixel *dst, int dst_stride,
-; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 2
cglobal pixel_avg_%1x%2
%endmacro
;-----------------------------------------------------------------------------
-; void pixel_avg_w4( pixel *dst, int dst_stride,
-; pixel *src1, int src1_stride, pixel *src2, int src2_stride,
-; int height, int weight );
+; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_FUNC 3
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void pixel_avg2_wN( uint16_t *dst, int dst_stride,
-; uint16_t *src1, int src_stride,
+; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
+; uint16_t *src1, intptr_t src_stride,
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src_stride,
+; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src1, intptr_t src_stride,
; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
%endmacro
;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
+; uint8_t *src, intptr_t i_src_stride, int i_height )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( pixel *pix_y, int stride_y,
-; pixel *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
+; pixel *pix_uv, intptr_t stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%macro PREFETCH_FENC 1
%if ARCH_X86_64
cglobal prefetch_fenc_%1, 5,5
- FIX_STRIDES r1d, r3d
+ FIX_STRIDES r1, r3
and r4d, 3
mov eax, r4d
imul r4d, r1d
PREFETCH_FENC 422
;-----------------------------------------------------------------------------
-; void prefetch_ref( pixel *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal prefetch_ref, 3,3
- FIX_STRIDES r1d
+ FIX_STRIDES r1
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
-; uint8_t *src, int src_stride,
+; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11
- FIX_STRIDES r3d, r4d
-%if WIN64
- movsxd r4, r4d
-%endif
+ FIX_STRIDES r3, r4
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
%define s30 [pad30]
%endif
add r0, r4
- lea r2, [r2+r4]
+ add r2, r4
neg r4
mova m7, [pw_pixel_max]
pxor m0, m0
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
add r2, r2
add r0, r2
- lea r1, [r1+r2]
+ add r1, r2
neg r2
mova m0, [tap1]
mova m7, [tap3]
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
+; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
%define src r1+r2
%if HIGH_BIT_DEPTH == 0
%macro HPEL_V 1
;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
-%if WIN64
- movsxd r4, r4d
-%endif
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
%endmacro
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
add r0, r2
%macro HPEL_C 0
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
add r0, r2
%endmacro
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
add r0, r2
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
%macro HPEL_H 0
cglobal hpel_filter_h, 3,3
%macro HPEL 0
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; uint8_t *src, int stride, int width, int height)
+; uint8_t *src, intptr_t stride, int width, int height )
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
-%if WIN64
- movsxd r4, r4d
- movsxd r5, r5d
-%endif
mov r7, r3
- sub r5, 16
+ sub r5d, 16
mov r8, r1
and r7, 15
sub r3, r7
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void plane_copy_core( pixel *dst, int i_dst,
-; pixel *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, intptr_t i_dst,
+; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
INIT_MMX
cglobal plane_copy_core_mmx2, 6,7
- FIX_STRIDES r1d, r3d, r4d
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
+ FIX_STRIDES r1, r3, r4d
+%if HIGH_BIT_DEPTH == 0
movsxdifnidn r4, r4d
+%endif
sub r1, r4
sub r3, r4
.loopy:
- mov r6d, r4d
- sub r6d, 63
+ lea r6d, [r4-63]
.loopx:
prefetchnta [r2+256]
movq m0, [r2 ]
%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
-; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
-; uint8_t *srcu, int i_srcu,
-; uint8_t *srcv, int i_srcv, int w, int h )
+; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
+; uint8_t *srcu, intptr_t i_srcu,
+; uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,9
- FIX_STRIDES r1d, r3d, r5d, r6d
+cglobal plane_copy_interleave_core, 6,9
+ mov r6d, r6m
%if HIGH_BIT_DEPTH
- mov r1m, r1d
- mov r3m, r3d
- mov r6m, r6d
+ FIX_STRIDES r1, r3, r5, r6d
+ movifnidn r1mp, r1
+ movifnidn r3mp, r3
+ mov r6m, r6d
%endif
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
- movsxdifnidn r5, r5d
- movsxdifnidn r6, r6d
lea r0, [r0+r6*2]
add r2, r6
add r4, r6
RET
;-----------------------------------------------------------------------------
-; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
+; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
- FIX_STRIDES r1d
+ FIX_STRIDES r1
.loop:
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
-; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
-; pixel *dstv, int i_dstv,
-; pixel *src, int i_src, int w, int h )
+; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
+; pixel *dstv, intptr_t i_dstv,
+; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
DEINTERLEAVE_START
mov r6d, r6m
- FIX_STRIDES r1d, r3d, r5d, r6d
+ FIX_STRIDES r1, r3, r5, r6d
%if HIGH_BIT_DEPTH
mov r6m, r6d
%endif
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
- movsxdifnidn r5, r5d
add r0, r6
add r2, r6
lea r4, [r4+r6*2]
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
DEINTERLEAVE_START
- FIX_STRIDES r2d
+ FIX_STRIDES r2
.loop:
DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
DEINTERLEAVE_START
- FIX_STRIDES r2d
+ FIX_STRIDES r2
.loop:
DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal integral_init4h_sse4, 3,4
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
-; void integral_init8v( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
shl r1, 1
INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
-; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal integral_init4v_mmx, 3,5
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; int src_stride, int dst_stride, int width, int height )
+; intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%if HIGH_BIT_DEPTH
shl dword r6m, 1
- FIX_STRIDES r5d
+ FIX_STRIDES r5
shl dword r7m, 1
-%endif
-%if WIN64
- movsxd r5, r5d
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
void func##_sse2 args;\
void func##_ssse3 args;
-DECL_SUF( x264_pixel_avg_16x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_16x8, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_8x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_8x8, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_8x4, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_4x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_4x8, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_4x4, ( pixel *, int, pixel *, int, pixel *, int, int ))
-DECL_SUF( x264_pixel_avg_4x2, ( pixel *, int, pixel *, int, pixel *, int, int ))
+DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x264_pixel_avg_4x2, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
#define MC_WEIGHT(w,type) \
- void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
+ void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );
#define MC_WEIGHT_OFFSET(w,type) \
- void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
- void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
+ void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
+ void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \
MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmx2 )
#undef MC_OFFSET
#undef MC_WEIGHT
-void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int );
-void x264_prefetch_ref_mmx2( pixel *, int, int );
-void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
-void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
-void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
- pixel *srcu, int i_srcu,
- pixel *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h );
-void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu,
- pixel *dstv, int i_dstv,
- pixel *src, int i_src, int w, int h );
-void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
- uint8_t *dstv, int i_dstv,
- uint8_t *src, int i_src, int w, int h );
-void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
- uint16_t *dstv, int i_dstv,
- uint16_t *src, int i_src, int w, int h );
-void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
-void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
-void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
-void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
-void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_mmx( void * dst, int n );
-void x264_memzero_aligned_sse2( void * dst, int n );
-void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
-void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride );
-void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
-void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
-void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
-void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
-void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
+void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int );
+void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
+void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu,
+ pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
+ pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
+ uint8_t *dstv, intptr_t i_dstv,
+ uint8_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
+ uint16_t *dstv, intptr_t i_dstv,
+ uint16_t *src, intptr_t i_src, int w, int h );
+void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_mmx ( void *dst, size_t n );
+void x264_memzero_aligned_sse2( void *dst, size_t n );
+void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
+void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
+void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride );
+void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
+void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
+void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride );
+void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
+void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
-void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
- pixel *src, int i_src,\
+void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
- int src_stride, int dst_stride, int width, int height );
+ intptr_t src_stride, intptr_t dst_stride, int width, int height );
LOWRES(mmx2)
LOWRES(cache32_mmx2)
LOWRES(sse2)
LOWRES(xop)
#define PIXEL_AVG_W(width,cpu)\
-void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
+void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
PIXEL_AVG_WALL(cache64_ssse3)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
-static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
+static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
{\
NULL,\
x264_pixel_avg2_w4_##name1,\
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
-static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
+static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
{\
NULL,\
x264_mc_copy_w4_##name1,\
#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
- static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
+ static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
{\
x264_mc_##function##_w4_##name1,\
x264_mc_##function##_w4_##name1,\
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#define MC_LUMA(name,instr1,instr2)\
-static void mc_luma_##name( pixel *dst, int i_dst_stride,\
- pixel *src[4], int i_src_stride,\
- int mvx, int mvy,\
- int i_width, int i_height, const x264_weight_t *weight )\
+static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\
+ pixel *src[4], intptr_t i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
#endif // !HIGH_BIT_DEPTH
#define GET_REF(name)\
-static pixel *get_ref_##name( pixel *dst, int *i_dst_stride,\
- pixel *src[4], int i_src_stride,\
- int mvx, int mvy,\
- int i_width, int i_height, const x264_weight_t *weight )\
+static pixel *get_ref_##name( pixel *dst, intptr_t *i_dst_stride,\
+ pixel *src[4], intptr_t i_src_stride,\
+ int mvx, int mvy,\
+ int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
#endif // !HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
-void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
-void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
-void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
+void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
+void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
+void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
- int stride, int width, int height, int16_t *buf )\
+ intptr_t stride, int width, int height, int16_t *buf )\
{\
- int realign = (intptr_t)src & (align-1);\
+ intptr_t realign = (intptr_t)src & (align-1);\
src -= realign;\
dstv -= realign;\
dstc -= realign;\
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
-static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
{
int c_w = 16/sizeof(pixel) - 1;
if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
}
#define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
- pixel *srcu, int i_srcu,\
- pixel *srcv, int i_srcv, int w, int h )\
+static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\
+ pixel *srcu, intptr_t i_srcu,\
+ pixel *srcv, intptr_t i_srcv, int w, int h )\
{\
if( !(w&15) ) {\
x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
push r0
;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
cglobal pixel_ssim_4x4x2_core, 0,5
mov r1, r1m
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
+; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
cglobal pixel_ssd_%1x%2, 4,5,6
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SSD 2
%if %1 != %2
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; The maximum width this function can handle without risk of overflow is given
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
;
; This implementation can potentially overflow on image widths >= 11008 (or
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_var_wxh( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
+; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
;-----------------------------------------------------------------------------
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
; for small blocks on x86_32, modify pixel pointer instead.
;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_satd_16x4_internal
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
lea r6, [r0+4*r1]
;=============================================================================
;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-; const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
%if HIGH_BIT_DEPTH
;=============================================================================
%macro ADS_START 0
-%if WIN64
+%if UNIX64
movsxd r5, r5d
+%else
+ mov r5d, r5m
%endif
mov r0d, r5d
lea r6, [r4+r5+15]
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
-cglobal pixel_ads4, 6,7
+cglobal pixel_ads4, 5,7
movq mm6, [r0]
movq mm4, [r0+8]
pshufw mm7, mm6, 0
movd [r6], mm1
ADS_END 1
-cglobal pixel_ads2, 6,7
+cglobal pixel_ads2, 5,7
movq mm6, [r0]
pshufw mm5, r6m, 0
pshufw mm7, mm6, 0
movd [r6], mm4
ADS_END 1
-cglobal pixel_ads1, 6,7
+cglobal pixel_ads1, 5,7
pshufw mm7, [r0], 0
pshufw mm6, r6m, 0
ADS_START
ADS_END 2
%macro ADS_XMM 0
-cglobal pixel_ads4, 6,7,12
+cglobal pixel_ads4, 5,7,12
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, q2222
%endif ; ARCH
ADS_END 2
-cglobal pixel_ads2, 6,7,8
+cglobal pixel_ads2, 5,7,8
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
movq [r6], xmm1
ADS_END 2
-cglobal pixel_ads1, 6,7,8
+cglobal pixel_ads1, 5,7,8
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
ret x264_pixel_##name##_4x4_##suffix args;\
#define DECL_X1( name, suffix ) \
- DECL_PIXELS( int, name, suffix, ( pixel *, int, pixel *, int ) )
+ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X4( name, suffix ) \
- DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\
- DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) )
+ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, int stride1,
- pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
- const pixel *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, int stride1,
- const pixel *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
+ const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
+ const pixel *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
+ const pixel *pix2, intptr_t stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
-int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
-int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
-int x264_pixel_vsad_ssse3( pixel *src, int stride, int height );
-int x264_pixel_vsad_xop( pixel *src, int stride, int height );
+float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
+int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,8
pxor m6, m6
+ movsxdifnidn r3, r3d
.loop:
mova m2, [r0+r3*4-2*mmsize]
mova m3, [r0+r3*4-1*mmsize]
%macro DENOISE_DCT 0
cglobal denoise_dct, 4,4,7
pxor m6, m6
+ movsxdifnidn r3, r3d
.loop:
mova m2, [r0+r3*2-2*mmsize]
mova m3, [r0+r3*2-1*mmsize]
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2_mmx2, 4,4
%macro SAD_W16 0
;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8
movu m0, [r2]
SAD_END_SSE2
;-----------------------------------------------------------------------------
-; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
movu m0, [r2]
RET
;-----------------------------------------------------------------------------
-; void pixel_vsad( pixel *src, int stride );
+; void pixel_vsad( pixel *src, intptr_t stride );
;-----------------------------------------------------------------------------
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
-%if WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
;-----------------------------------------------------------------------------
; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-; uint8_t *pix2, int i_stride, int scores[3] )
+; uint8_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
-%if WIN64
- %assign i %1+1
- movsxd r %+ i, r %+ i %+ d
-%endif
SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
SAD_X%1_2x%2P_SSE2 0
%if ARCH_X86_64
PROLOGUE 6,9
mov r8, r6mp
-%if WIN64
- movsxd r5, r5d
-%endif
push r4
push r3
push r2
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,4
%endmacro
;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SAD_XMM 2
cglobal pixel_sad_%1x%2, 4,4,8
;-----------------------------------------------------------------------------
; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
-; uint16_t *pix2, int i_stride, int scores[3] )
+; uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
-%if WIN64
- movsxd STRIDE, STRIDE %+ d
-%endif
mov r6, %3/2-1
SAD_X%1_ONE_START
SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
%define mangle(x) x
%endif
-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
pixel *src0, *src1;
- int stride0 = 16, stride1 = 16;
+ intptr_t stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
ALIGNED_4( int16_t mvc[9][2] );
int try_skip = a->b_try_skip;
int y8 = i>>1;
int i_part_cost;
int i_part_cost_bi;
- int stride[2] = {8,8};
+ intptr_t stride[2] = {8,8};
pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_8x8;
int y8 = i>>1;
int i_part_cost;
int i_part_cost_bi = 0;
- int stride[2] = {8,8};
+ intptr_t stride[2] = {8,8};
pixel *src[2];
for( int l = 0; l < 2; l++ )
{
int i_part_cost;
int i_part_cost_bi = 0;
- int stride[2] = {16,16};
+ intptr_t stride[2] = {16,16};
pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_16x8;
{
int i_part_cost;
int i_part_cost_bi = 0;
- int stride[2] = {8,8};
+ intptr_t stride[2] = {8,8};
pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_8x16;
#define COST_MV_HPEL( mx, my ) \
{ \
- int stride2 = 16; \
+ intptr_t stride2 = 16; \
pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
#define COST_MV_SAD( mx, my ) \
{ \
- int stride = 16; \
+ intptr_t stride = 16; \
pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
#define COST_MV_SATD( mx, my, dir ) \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
- int stride = 16; \
+ intptr_t stride = 16; \
pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
{
int omx = bmx, omy = bmy;
int costs[4];
- int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
+ intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
pixel *src0, *src1, *src2, *src3;
src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
int ref1 = h->mb.cache.ref[1][s8];
const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- int stride[3][2][9];
+ intptr_t stride[3][2][9];
int bm0x = m0->mv[0];
int bm0y = m0->mv[1];
int bm1x = m1->mv[0];
} \
else \
{ \
- int stride1 = 16, stride2 = 16; \
+ intptr_t stride1 = 16, stride2 = 16; \
pixel *src1, *src2; \
src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
(mv0)[0], (mv0)[1], 8, 8, w ); \
;* Copyright (C) 2008-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
error_message: db "failed to preserve register", 0
-%if WIN64
+%if ARCH_X86_64
; just random numbers to reduce the chance of incidental match
ALIGN 16
x6: ddq 0x79445c159ce790641a1b2550a612b48c
; (max_args % 4) must equal 3 for stack alignment
%define max_args 15
+%if ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; void x264_checkasm_stack_clobber( uint64_t clobber, ... )
+;-----------------------------------------------------------------------------
+cglobal checkasm_stack_clobber, 1,2
+ ; Clobber the stack with junk below the stack pointer
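+    ; (The junk lands where a following call will place its on-stack arguments; a
+    ; compiler passing a 32-bit int there typically writes only the low dword of the
+    ; 8-byte slot, so the upper half keeps this junk.)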
+ %define size (max_args+6)*8
+ SUB rsp, size
+ mov r1, size-8
+.loop:
+ mov [rsp+r1], r0
+ sub r1, 8
+ jge .loop
+ ADD rsp, size
+ RET
+
%if WIN64
+ %assign free_regs 7
+%else
+ %assign free_regs 9
+%endif
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal checkasm_call, 4,15,16
- SUB rsp, max_args*8
+cglobal checkasm_call, 2,15,16
+ SUB rsp, max_args*8+16
mov r6, r0
- mov [rsp+stack_offset+16], r1
- mov r0, r2
- mov r1, r3
- mov r2d, r4m ; FIXME truncates pointer
- mov r3d, r5m ; FIXME truncates pointer
-%assign i 4
-%rep max_args-4
- mov r4, [rsp+stack_offset+8+(i+2)*8]
- mov [rsp+i*8], r4
- %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
- mova m %+ i, [x %+ i]
- %assign i i+1
-%endrep
-%assign i 7
-%rep 15-7
+ mov [rsp+max_args*8], r1
+
+ ; All arguments have been pushed on the stack instead of registers in order to
+ ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
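+    ; func, &ok and the four dummy zeros occupy the first six argument slots, so the
+    ; real arguments start at argument 6 and are reloaded here as full pointer-sized
+    ; words (r6mp and up).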
+ mov r0, r6mp
+ mov r1, r7mp
+ mov r2, r8mp
+ mov r3, r9mp
+%if UNIX64
+ mov r4, r10mp
+ mov r5, r11mp
+ %assign i 6
+ %rep max_args-6
+ mov r9, [rsp+stack_offset+(i+1)*8]
+ mov [rsp+(i-6)*8], r9
+ %assign i i+1
+ %endrep
+%else
+ %assign i 4
+ %rep max_args-4
+ mov r9, [rsp+stack_offset+(i+7)*8]
+ mov [rsp+i*8], r9
+ %assign i i+1
+ %endrep
+%endif
+
+%if WIN64
+ %assign i 6
+ %rep 16-6
+ mova m %+ i, [x %+ i]
+ %assign i i+1
+ %endrep
+%endif
+
+%assign i 14
+%rep 15-free_regs
mov r %+ i, [n %+ i]
- %assign i i+1
+ %assign i i-1
%endrep
call r6
-%assign i 7
-%rep 15-7
+%assign i 14
+%rep 15-free_regs
xor r %+ i, [n %+ i]
- or r7, r %+ i
- %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
- pxor m %+ i, [x %+ i]
- por m6, m %+ i
- %assign i i+1
+ or r14, r %+ i
+ %assign i i-1
%endrep
+
+%if WIN64
+ %assign i 6
+ %rep 16-6
+ pxor m %+ i, [x %+ i]
+ por m6, m %+ i
+ %assign i i+1
+ %endrep
packsswb m6, m6
movq r5, m6
- or r7, r5
+ or r14, r5
+%endif
+
jz .ok
- mov r4, rax
+ mov r9, rax
lea r0, [error_message]
call puts
- mov r1, [rsp+stack_offset+16]
+ mov r1, [rsp+max_args*8]
mov dword [r1], 0
- mov rax, r4
+ mov rax, r9
.ok:
- ADD rsp, max_args*8
+ ADD rsp, max_args*8+16
RET
-%elif ARCH_X86_64 == 0
+%else
; just random numbers to reduce the chance of incidental match
%define n3 dword 0x6549315c
#if ARCH_X86 || ARCH_X86_64
int x264_stack_pagealign( int (*func)(), int align );
+
+/* detect when callee-saved regs aren't saved
+ * needs an explicit asm check because it only sometimes crashes in normal use. */
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
#else
#define x264_stack_pagealign( func, align ) func()
#endif
#define call_c1(func,...) func(__VA_ARGS__)
-#if ARCH_X86 || defined(_WIN64)
-/* detect when callee-saved regs aren't saved.
- * needs an explicit asm check because it only sometimes crashes in normal use. */
-intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
-#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__)
+#if ARCH_X86_64
+/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
+ * This is done by clobbering the stack with junk around the stack pointer and calling the
+ * assembly function through x264_checkasm_call with added dummy arguments which force all
+ * real arguments to be passed on the stack and not in registers. For 32-bit arguments the
+ * upper half of the 64-bit register location on the stack will now contain junk. Note that
+ * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
+ * overwrite the junk written to the stack, so there's no guarantee that it will always
+ * detect all functions that assume zero-extension.
+ */
+void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#define call_a1(func,...) ({ \
+ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
+#elif ARCH_X86
+#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
#else
#define call_a1 call_c1
#endif
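For clarity, an editor's sketch (using names from the surrounding checkasm code; not part of the patch): a test-site call such as call_a1( pixel_asm.sad[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 ) expands on x86-64 to roughly

    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL;  /* junk pattern */
    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* fill (max_args+6)*8 bytes below rsp */
    x264_checkasm_call( (intptr_t(*)())pixel_asm.sad[i], &ok,
                        0, 0, 0, 0,  /* dummies push every real argument onto the clobbered stack */
                        pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );

so any asm that reads a stride as a full 64-bit register without sign-extending it will see the clobber junk in the upper half and fail the test.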
used_asm = 1; \
for( int j = 0; j < 64; j++ ) \
{ \
- res_c = call_c( pixel_c.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
- res_asm = call_a( pixel_asm.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
+ res_c = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+ res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
for( int j = 0; j < 64; j++ ) \
{ \
pixel *pix2 = pbuf2+j; \
- res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
+ res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
if( N == 4 ) \
{ \
res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
- call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+ call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
} \
else \
- call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+ call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
} \
if( N == 4 ) \
- call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+ call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
else \
- call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+ call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
} \
} \
} \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
/* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
- call_c1( pixel_c.var[i], pbuf1, 16 ); \
- call_a1( pixel_asm.var[i], pbuf1, 16 ); \
+ call_c1( pixel_c.var[i], pbuf1, 16 ); \
+ call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
uint64_t res_c = pixel_c.var[i]( pbuf1, 16 ); \
uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
if( res_c != res_asm ) \
ok = 0; \
fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
- call_c2( pixel_c.var[i], pbuf1, 16 ); \
- call_a2( pixel_asm.var[i], pbuf1, 16 ); \
+ call_c2( pixel_c.var[i], pbuf1, (intptr_t)16 ); \
+ call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
}
ok = 1; used_asm = 0;
int res_c, res_asm, ssd_c, ssd_asm; \
set_func_name( "%s_%s", "var2", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
- res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
+ res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
+ res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
if( res_c != res_asm || ssd_c != ssd_asm ) \
{ \
ok = 0; \
for( int j = 0; j < 32; j++ )
{
pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
- call_c1( pixel_c.hadamard_ac[i], pbuf1, 16 );
- call_a1( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+ call_c1( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+ call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )
break;
}
}
- call_c2( pixel_c.hadamard_ac[i], pbuf1, 16 );
- call_a2( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+ call_c2( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+ call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
}
report( "pixel hadamard_ac :" );
for( int j = 0; j < 2 && ok; j++ )
{
pixel *p = j ? pbuf4 : pbuf1;
- res_c = call_c( pixel_c.vsad, p, 16, h );
- res_asm = call_a( pixel_asm.vsad, p, 16, h );
+ res_c = call_c( pixel_c.vsad, p, (intptr_t)16, h );
+ res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
if( res_c != res_asm )
{
ok = 0;
fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
res_u_c, res_v_c, res_u_a, res_v_a );
}
- call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
- call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
+ call_c( pixel_c.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
+ call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
}
report( "ssd_nv12 :" );
fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
}
set_func_name( "ssim_core" );
- call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
- call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
+ call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+ call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
set_func_name( "ssim_end" );
call_c2( pixel_c.ssim_end4, sums, sums, 4 );
call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
used_asm = 1; \
for( int i = 0; i < 1024; i++ ) \
pbuf3[i] = pbuf4[i] = 0xCD; \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
- call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
+ call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+ call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
if( mc_a.get_ref != mc_ref.get_ref ) \
{ \
pixel *ref = dst2; \
- int ref_stride = 32; \
+ intptr_t ref_stride = 32; \
int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
const x264_weight_t *weight = x264_weight_none; \
set_func_name( "get_ref_%dx%d", w_checked, h ); \
used_asm = 1; \
for( int i = 0; i < 1024; i++ ) \
pbuf3[i] = pbuf4[i] = 0xCD; \
- call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
- ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
+ call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+ ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
for( int i = 0; i < h; i++ ) \
if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
{ \
used_asm = 1; \
for( int i = 0; i < 1024; i++ ) \
pbuf3[i] = pbuf4[i] = 0xCD; \
- call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \
- call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
+ call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
for( int j = 0; j < h; j++ ) \
for( int i = w; i < 8; i++ ) \
{ \
dst2[i+j*16+8] = dst1[i+j*16+8]; \
- dst2[i+j*16] = dst1[i+j*16]; \
+ dst2[i+j*16 ] = dst1[i+j*16 ]; \
} \
if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
{ \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] ); \
used_asm = 1; \
- call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
- call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+ call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+ call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
} \
- call_c2( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
- call_a2( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+ call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+ call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
} \
} \
}
{ \
set_func_name( "%s_w%d", #name, j ); \
used_asm = 1; \
- call_c1( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
+ call_c1( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
mc_a.weight_cache(&ha, &weight); \
- call_a1( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+ call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
for( int k = 0; k < 16; k++ ) \
if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
{ \
fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
break; \
} \
- call_c2( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
- call_a2( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+ call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+ call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
} \
}
used_asm = 1;
memset( pbuf3, 0, 64*height );
memset( pbuf4, 0, 64*height );
- call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
- call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
+ call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
+ call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
if( memcmp( pbuf3, pbuf4, 64*height ) )
{
ok = 0;
{
set_func_name( "load_deinterleave_chroma_fenc" );
used_asm = 1;
- call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
- call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
+ call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
+ call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
{
ok = 0;
{
set_func_name( "load_deinterleave_chroma_fdec" );
used_asm = 1;
- call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
- call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
+ call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
+ call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
{
ok = 0;
{
int w = plane_specs[i].w;
int h = plane_specs[i].h;
- int src_stride = plane_specs[i].src_stride;
- int dst_stride = (w + 127) & ~63;
+ intptr_t src_stride = plane_specs[i].src_stride;
+ intptr_t dst_stride = (w + 127) & ~63;
assert( dst_stride * h <= 0x1000 );
pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
memset( pbuf3, 0, 0x1000*sizeof(pixel) );
if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
{
ok = 0;
- fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
- int src_stride = (plane_specs[i].src_stride + 1) >> 1;
- int dst_stride = (2*w + 127) & ~63;
+ intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
+ intptr_t dst_stride = (2*w + 127) & ~63;
assert( dst_stride * h <= 0x1000 );
pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
memset( pbuf3, 0, 0x1000*sizeof(pixel) );
if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
{
ok = 0;
- fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
- int dst_stride = w;
- int src_stride = (2*w + 127) & ~63;
- int offv = (dst_stride*h + 31) & ~15;
+ intptr_t dst_stride = w;
+ intptr_t src_stride = (2*w + 127) & ~63;
+ intptr_t offv = (dst_stride*h + 31) & ~15;
memset( pbuf3, 0, 0x1000 );
memset( pbuf4, 0, 0x1000 );
call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
{
ok = 0;
- fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
ok = 1; used_asm = 1;
memset( pbuf3, 0, 4096 * sizeof(pixel) );
memset( pbuf4, 0, 4096 * sizeof(pixel) );
- call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
- call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
+ call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
+ call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 10; j++ )
//FIXME ideally the first pixels would match too, but they aren't actually used
ok = 1; used_asm = 1;
for( int w = 40; w <= 48; w += 8 )
{
- int stride = (w+8)&~15;
- call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
- call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+ intptr_t stride = (w+8)&~15;
+ call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 );
+ call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 );
for( int i = 0; i < 16; i++ )
{
for( int j = 0; j < 4; j++ )
#define INTEGRAL_INIT( name, size, ... )\
if( mc_a.name != mc_ref.name )\
{\
- int stride = 80;\
+ intptr_t stride = 80;\
set_func_name( #name );\
used_asm = 1;\
memcpy( buf3, buf1, size*2*stride );\
{
set_func_name( "memcpy_aligned" );
ok = 1; used_asm = 1;
- for( int size = 16; size < 256; size += 16 )
+ for( size_t size = 16; size < 256; size += 16 )
{
memset( buf4, 0xAA, size + 1 );
call_c( mc_c.memcpy_aligned, buf3, buf1, size );
if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
{
ok = 0;
- fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+ fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
break;
}
}
{
set_func_name( "memzero_aligned" );
ok = 1; used_asm = 1;
- for( int size = 128; size < 1024; size += 128 )
+ for( size_t size = 128; size < 1024; size += 128 )
{
memset( buf4, 0xAA, size + 1 );
call_c( mc_c.memzero_aligned, buf3, size );
if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
{
ok = 0;
- fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+ fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
break;
}
}
#define TEST_DEBLOCK( name, align, ... ) \
for( int i = 0; i < 36; i++ ) \
{ \
- int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
+ intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
for( int j = 0; j < 1024; j++ ) \
/* two distributions of random to exercise different failure modes */ \
pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
{ \
set_func_name( #name ); \
used_asm = 1; \
- call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
- call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
break; \
} \
- call_c2( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
- call_a2( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
} \
}
memcpy( dct1, buf1, size*sizeof(dctcoef) );
memcpy( dct2, buf1, size*sizeof(dctcoef) );
memcpy( buf3+256, buf3, 256 );
- call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+ call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
ok = 0;
- call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+ call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
}
}