From: Henrik Gramner
Date: Wed, 1 Feb 2012 22:52:48 +0000 (+0100)
Subject: Fix incorrect zero-extension assumptions in x86_64 asm
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3131a19cabcdca221ce4cd61a3cff68d99f1a517;p=libx264

Fix incorrect zero-extension assumptions in x86_64 asm

Some x264 asm assumed that the high 32 bits of registers containing "int" values would be zero. This is almost always the case, and it seems to work with gcc, but it is *not* guaranteed by the ABI. As a result, it breaks with some other compilers, like Clang, that take advantage of this in optimizations. Accordingly, fix all x86 code by using intptr_t instead of int, or by using movsxd where necessary. Also add a checkasm hack to detect when assembly functions incorrectly assume that 32-bit integers are zero-extended to 64-bit.
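To make the failure mode concrete, here is a minimal sketch (illustrative only, not code from this patch; the SAD prototype is used as a representative example). Before the fix, strides were declared as 32-bit ints while the x86_64 asm behind the prototype used the full 64-bit registers for address arithmetic, so correctness silently depended on the caller having extended them:

/* pre-fix style: behaviour depends on bits 32-63 of the stride
 * registers, which the x86-64 ABI leaves unspecified for an "int" */
int x264_pixel_sad_16x16_sse2( uint8_t *pix1, int i_pix1,
                               uint8_t *pix2, int i_pix2 );

/* post-fix style: a pointer-sized stride makes all 64 bits of the
 * register meaningful, so no extension assumption remains */
int x264_pixel_sad_16x16_sse2( uint8_t *pix1, intptr_t i_pix1,
                               uint8_t *pix2, intptr_t i_pix2 );

Where a parameter deliberately stays int, the asm must extend it itself before using it in addressing, e.g. "movsxd r1, r1d" on x86_64.

---
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index 8ed0a227..507bbba1 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -32,7 +32,7 @@ // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 // They also use nothing above armv5te, but we don't care about pre-armv6 -// void prefetch_ref( uint8_t *pix, int stride, int parity ) +// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity ) function x264_prefetch_ref_arm sub r2, r2, #1 add r0, r0, #64 @@ -51,8 +51,8 @@ function x264_prefetch_ref_arm bx lr .endfunc -// void prefetch_fenc( uint8_t *pix_y, int stride_y, -// uint8_t *pix_uv, int stride_uv, int mb_x ) +// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y, +// uint8_t *pix_uv, intptr_t stride_uv, int mb_x ) function x264_prefetch_fenc_arm ldr ip, [sp] push {lr} @@ -78,7 +78,7 @@ function x264_prefetch_fenc_arm .endfunc -// void *x264_memcpy_aligned( void * dst, const void * src, size_t n ) +// void *x264_memcpy_aligned( void *dst, const void *src, size_t n ) function x264_memcpy_aligned_neon orr r3, r0, r1, lsr #1 movrel ip, memcpy_table @@ -158,9 +158,9 @@ memzero_loop: .endfunc -// void pixel_avg( uint8_t *dst, int dst_stride, -// uint8_t *src1, int src1_stride, -// uint8_t *src2, int src2_stride, int weight ); +// void pixel_avg( uint8_t *dst, intptr_t dst_stride, +// uint8_t *src1, intptr_t src1_stride, +// uint8_t *src2, intptr_t src2_stride, int weight ); .macro AVGH w h function x264_pixel_avg_\w\()x\h\()_neon ldr ip, [sp, #8] @@ -455,7 +455,7 @@ avg2_w20_loop: .endif .endm -// void mc_weight( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, +// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride, // const x264_weight_t *weight, int height ) function x264_mc_weight_w20_neon weight_prologue full @@ -744,7 +744,7 @@ weight_simple offsetadd, vqadd.u8 weight_simple offsetsub, vqsub.u8 -// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height ) +// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height ) function x264_mc_copy_w4_neon ldr ip, [sp] copy_w4_loop: @@ -810,8 +810,8 @@ copy_w16_aligned_loop: .endfunc -// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride, -// uint8_t *src, int i_src_stride, +// void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride, +// uint8_t *src, intptr_t i_src_stride, // int dx, int dy, int i_width, int i_height ); function x264_mc_chroma_neon push {r4-r6, lr} @@ -1052,7 +1052,7 @@ mc_chroma_w8: .endfunc -// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width) +// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, 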
int width ) function x264_hpel_filter_v_neon ldr ip, [sp] sub r1, r1, r3, lsl #1 @@ -1266,7 +1266,7 @@ filter_h_loop: // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, -// uint8_t *dstc, int src_stride, int dst_stride, int width, +// uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width, // int height ) function x264_frame_init_lowres_core_neon push {r4-r10,lr} diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index de4aec4e..1148ae76 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -26,33 +26,33 @@ #include "common/common.h" #include "mc.h" -void x264_prefetch_ref_arm( uint8_t *, int, int ); -void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int ); - -void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n ); -void x264_memzero_aligned_neon( void *dst, int n ); - -void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); -void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); - -void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); -void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +void x264_prefetch_ref_arm( uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_neon( void *dst, size_t n ); + +void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); #define MC_WEIGHT(func)\ -void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ -void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const 
x264_weight_t *, int );\ -void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ -void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\ +void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ \ -static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\ +static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ @@ -67,15 +67,15 @@ MC_WEIGHT(_nodenom) MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetsub) -void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int ); -void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); -void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int ); -void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int); +void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); -void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int ); +void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int ); void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); @@ -101,7 +101,7 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) w->weightfn = x264_mc_wtab_neon; } -static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = +static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = { NULL, x264_pixel_avg2_w4_neon, @@ -111,7 +111,7 @@ static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, in x264_pixel_avg2_w20_neon, }; -static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) = +static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = { NULL, x264_mc_copy_w4_neon, @@ -123,13 +123,13 @@ static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; -static void mc_luma_neon( uint8_t *dst, int i_dst_stride, - uint8_t *src[4], int i_src_stride, +static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int 
i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; @@ -149,13 +149,13 @@ static void mc_luma_neon( uint8_t *dst, int i_dst_stride, x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } -static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, - uint8_t *src[4], int i_src_stride, +static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; @@ -183,9 +183,9 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, } static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - int stride, int width, int height, int16_t *buf ) + intptr_t stride, int width, int height, int16_t *buf ) { - int realign = (intptr_t)src & 15; + intptr_t realign = (intptr_t)src & 15; src -= realign; dstv -= realign; dstc -= realign; diff --git a/common/arm/pixel.h b/common/arm/pixel.h index d0c90dae..ba390112 100644 --- a/common/arm/pixel.h +++ b/common/arm/pixel.h @@ -39,11 +39,11 @@ DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) #define DECL_X4( name, suffix ) \ - DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\ - DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) ) + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) -int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t ); DECL_X1( sad, neon ) DECL_X1( sad_aligned, neon ) @@ -52,21 +52,21 @@ DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) -int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); -uint64_t x264_pixel_var_8x8_neon( uint8_t *, int ); -uint64_t x264_pixel_var_16x16_neon( uint8_t *, int ); -int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * ); +uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); +int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int ); -uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int ); +uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t 
*, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); -void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int, - const uint8_t *, int, - int sums[2][4]); +void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, + const uint8_t *, intptr_t, + int sums[2][4] ); float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); #endif diff --git a/common/deblock.c b/common/deblock.c index 922b076d..51f0d7a8 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -76,7 +76,7 @@ static const int8_t i_tc0_table[52+12*3][4] = #define tc0_table(x) i_tc0_table[(x)+24] /* From ffmpeg */ -static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 ) +static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -107,7 +107,7 @@ static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alph pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { @@ -120,21 +120,21 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { for( int d = 0; d < 8; d++, pix += stride ) deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] ); } -static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, stride, 1, alpha, beta, tc0 ); } -static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_luma_c( pix, 1, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc ) +static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -148,7 +148,7 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */ } } -static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) +static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 ) { for( int i = 0; i < 4; i++ ) { @@ -163,24 +163,24 @@ static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] ); } } -static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 
1, 2, stride, alpha, beta, tc0 ); } -static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 ); } -static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 ); } -static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 ); } -static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p2 = pix[-3*xstride]; int p1 = pix[-2*xstride]; @@ -219,26 +219,26 @@ static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, in } } } -static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta ) +static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 16; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta ) +static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < 8; d++, pix += ystride ) deblock_edge_luma_intra_c( pix, 1, alpha, beta ); } -static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, stride, 1, alpha, beta ); } -static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_luma_intra_c( pix, 1, stride, alpha, beta ); } -static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta ) { int p1 = pix[-2*xstride]; int p0 = pix[-1*xstride]; @@ -251,25 +251,25 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */ } } -static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta ) +static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta ) { for( int d = 0; d < height; d++, pix += ystride-2 ) for( int e = 0; e < width; e++, pix++ ) deblock_edge_chroma_intra_c( pix, xstride, alpha, beta ); } -static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta ); } -static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 1, 16, stride, 
2, alpha, beta ); } -static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta ); } -static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta ) +static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) { deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta ); } @@ -303,7 +303,8 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 } } -static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) +static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_inter_t pf_inter ) { int index_a = i_qp + a; int index_b = i_qp + b; @@ -322,7 +323,8 @@ static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uin pf_inter( pix, i_stride, alpha, beta, tc ); } -static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) +static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp, + int a, int b, int b_chroma, x264_deblock_intra_t pf_intra ) { int index_a = i_qp + a; int index_b = i_qp + b; @@ -631,30 +633,30 @@ void x264_macroblock_deblock( x264_t *h ) } #if HAVE_MMX -void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta ); -void 
x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); @@ -668,32 +670,32 @@ void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); -void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); #if ARCH_X86 
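/* [Illustrative aside, not part of the patch.] The "checkasm hack"
 * mentioned in the commit message works by planting garbage in bits
 * 32-63 of the registers that carry 32-bit int arguments before
 * calling the asm under test; code that wrongly assumes those bits
 * are zero then computes bogus addresses and fails (or crashes) the
 * test. A simplified, hypothetical C sketch of the idea -- the real
 * tool does this in an asm call wrapper, and sad_raw_fn/call_dirty
 * are made-up names: */
typedef int (*sad_raw_fn)( uint8_t *, uint64_t, uint8_t *, uint64_t );
static int call_dirty( sad_raw_fn fn, uint8_t *p1, int s1, uint8_t *p2, int s2 )
{
    uint64_t garbage = 0xdeadbeef00000000ULL; /* nonzero upper half */
    /* On x86-64 SysV every integer argument occupies a full 64-bit
     * register, so this simulates a caller that never zero-extended
     * its 32-bit ints: */
    return fn( p1, garbage | (uint32_t)s1, p2, garbage | (uint32_t)s2 );
}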
-void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta ); -void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); -void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #if HIGH_BIT_DEPTH -void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta ); +void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); #else // FIXME this wrapper has a significant cpu cost -static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 ); x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 ); } -static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta ) +static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta ) { x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta ); x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta ); @@ -703,15 +705,15 @@ static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, #endif #if ARCH_PPC -void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); -void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif // ARCH_PPC #if HAVE_ARMV6 -void x264_deblock_v_luma_neon( 
uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * ); -void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * ); +void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) diff --git a/common/frame.c b/common/frame.c index e13e5097..21d13476 100644 --- a/common/frame.c +++ b/common/frame.c @@ -710,8 +710,8 @@ x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ) return frame; } -void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, - int i_width, int i_height, x264_weight_t *w ) +void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, + int i_width, int i_height, x264_weight_t *w ) { /* Weight horizontal strips of height 16. This was found to be the optimal height * in terms of the cache loads. */ diff --git a/common/frame.h b/common/frame.h index 94e875d2..54415f7f 100644 --- a/common/frame.h +++ b/common/frame.h @@ -178,8 +178,8 @@ typedef struct x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */ } x264_sync_frame_list_t; -typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 ); -typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta ); +typedef void (*x264_deblock_inter_t)( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +typedef void (*x264_deblock_intra_t)( pixel *pix, intptr_t stride, int alpha, int beta ); typedef struct { x264_deblock_inter_t deblock_luma[2]; @@ -232,7 +232,7 @@ x264_frame_t *x264_frame_shift( x264_frame_t **list ); void x264_frame_push_unused( x264_t *h, x264_frame_t *frame ); void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame ); x264_frame_t *x264_frame_pop_blank_unused( x264_t *h ); -void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, +void x264_weight_scale_plane( x264_t *h, pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, int i_width, int i_height, x264_weight_t *w ); x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec ); void x264_frame_delete_list( x264_frame_t **list ); diff --git a/common/macroblock.c b/common/macroblock.c index d600f82d..b4e6d951 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -121,7 +121,7 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int i_mode = x264_size2pixel[height][width]; - int i_stride0 = 16, i_stride1 = 16; + intptr_t i_stride0 = 16, i_stride1 = 16; ALIGNED_ARRAY_16( pixel, tmp0,[16*16] ); ALIGNED_ARRAY_16( pixel, tmp1,[16*16] ); pixel *src0, *src1; diff --git a/common/mc.c b/common/mc.c index 88ed6ea8..86f7e35a 100644 --- a/common/mc.c +++ b/common/mc.c @@ -37,10 +37,9 @@ #endif -static inline void pixel_avg( pixel *dst, int 
i_dst_stride, - pixel *src1, int i_src1_stride, - pixel *src2, int i_src2_stride, - int i_width, int i_height ) +static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, + pixel *src1, intptr_t i_src1_stride, + pixel *src2, intptr_t i_src2_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { @@ -52,7 +51,9 @@ static inline void pixel_avg( pixel *dst, int i_dst_stride, } } -static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height ) +static inline void pixel_avg_wxh( pixel *dst, intptr_t i_dst, + pixel *src1, intptr_t i_src1, + pixel *src2, intptr_t i_src2, int width, int height ) { for( int y = 0; y < height; y++ ) { @@ -66,9 +67,11 @@ static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1 /* Implicit weighted bipred only: * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */ -static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height, int i_weight1 ) +static inline void pixel_avg_weight_wxh( pixel *dst, intptr_t i_dst, + pixel *src1, intptr_t i_src1, + pixel *src2, intptr_t i_src2, int width, int height, int i_weight1 ) { - const int i_weight2 = 64 - i_weight1; + int i_weight2 = 64 - i_weight1; for( int y = 0; y> 6 ); @@ -76,9 +79,9 @@ static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int #undef op_scale2 #define PIXEL_AVG_C( name, width, height ) \ -static void name( pixel *pix1, int i_stride_pix1, \ - pixel *pix2, int i_stride_pix2, \ - pixel *pix3, int i_stride_pix3, int weight ) \ +static void name( pixel *pix1, intptr_t i_stride_pix1, \ + pixel *pix2, intptr_t i_stride_pix2, \ + pixel *pix3, intptr_t i_stride_pix3, int weight ) \ { \ if( weight == 32 ) \ pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \ @@ -104,7 +107,8 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w ) } #define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset ) #define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset ) -static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height ) +static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, + const x264_weight_t *weight, int i_width, int i_height ) { int offset = weight->i_offset << (BIT_DEPTH-8); int scale = weight->i_scale; @@ -124,7 +128,7 @@ static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_strid } #define MC_WEIGHT_C( name, width ) \ - static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \ + static void name( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, const x264_weight_t *weight, int height ) \ { \ mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\ } @@ -146,7 +150,7 @@ static weight_fn_t x264_mc_weight_wtab[6] = mc_weight_w20, }; const x264_weight_t x264_weight_none[3] = { {{0}} }; -static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, int i_width, int i_height ) +static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height ) { for( int y = 0; y < i_height; y++ ) { @@ -159,7 +163,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, #define 
TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d])) static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, - int stride, int width, int height, int16_t *buf ) + intptr_t stride, int width, int height, int16_t *buf ) { const int pad = (BIT_DEPTH > 9) ? (-10 * PIXEL_MAX) : 0; for( int y = 0; y < height; y++ ) @@ -185,8 +189,8 @@ static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; -static void mc_luma( pixel *dst, int i_dst_stride, - pixel *src[4], int i_src_stride, +static void mc_luma( pixel *dst, intptr_t i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { @@ -208,8 +212,8 @@ static void mc_luma( pixel *dst, int i_dst_stride, mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height ); } -static pixel *get_ref( pixel *dst, int *i_dst_stride, - pixel *src[4], int i_src_stride, +static pixel *get_ref( pixel *dst, intptr_t *i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { @@ -239,8 +243,8 @@ static pixel *get_ref( pixel *dst, int *i_dst_stride, } /* full chroma mc (ie until 1/8 pixel)*/ -static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride, - pixel *src, int i_src_stride, +static void mc_chroma( pixel *dstu, pixel *dstv, intptr_t i_dst_stride, + pixel *src, intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height ) { @@ -273,7 +277,7 @@ static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride, } #define MC_COPY(W) \ -static void mc_copy_w##W( pixel *dst, int i_dst, pixel *src, int i_src, int i_height ) \ +static void mc_copy_w##W( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int i_height ) \ { \ mc_copy( src, i_src, dst, i_dst, W, i_height ); \ } @@ -281,8 +285,8 @@ MC_COPY( 16 ) MC_COPY( 8 ) MC_COPY( 4 ) -void x264_plane_copy_c( pixel *dst, int i_dst, - pixel *src, int i_src, int w, int h ) +void x264_plane_copy_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) { while( h-- ) { @@ -292,9 +296,9 @@ void x264_plane_copy_c( pixel *dst, int i_dst, } } -void x264_plane_copy_interleave_c( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ) +void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ) { for( int y=0; yssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height, ssd_u, ssd_v ); if( i_width&7 ) @@ -171,7 +174,7 @@ void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pi * pixel_var_wxh ****************************************************************************/ #define PIXEL_VAR_C( name, w, h ) \ -static uint64_t name( pixel *pix, int i_stride ) \ +static uint64_t name( pixel *pix, intptr_t i_stride ) \ { \ uint32_t sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ @@ -194,7 +197,7 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 ) * pixel_var2_wxh ****************************************************************************/ #define PIXEL_VAR2_C( name, w, h, shift ) \ -static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \ +static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \ { \ uint32_t var = 
0, sum = 0, sqr = 0; \ for( int y = 0; y < h; y++ ) \ @@ -249,7 +252,7 @@ static ALWAYS_INLINE sum2_t abs2( sum2_t a ) * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences ****************************************************************************/ -static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[4][2]; sum2_t a0, a1, a2, a3, b0, b1; @@ -274,7 +277,7 @@ static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, i return sum >> 1; } -static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[4][4]; sum2_t a0, a1, a2, a3; @@ -296,7 +299,7 @@ static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, i } #define PIXEL_SATD_C( w, h, sub )\ -static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\ +static int x264_pixel_satd_##w##x##h( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 )\ {\ int sum = sub( pix1, i_pix1, pix2, i_pix2 )\ + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\ @@ -318,7 +321,7 @@ PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 ) PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 ) PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 ) -static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static NOINLINE int sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { sum2_t tmp[8][4]; sum2_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3; @@ -352,13 +355,13 @@ static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) return sum; } -static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ); return (sum+2)>>2; } -static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) +static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) { int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 ) + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 ) @@ -368,7 +371,7 @@ static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pi } -static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride ) +static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride ) { sum2_t tmp[32]; sum2_t a0, a1, a2, a3, dc; @@ -406,7 +409,7 @@ static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride ) } #define HADAMARD_AC(w,h) \ -static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\ +static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, intptr_t stride )\ {\ uint64_t sum = pixel_hadamard_ac( pix, stride );\ if( w==16 )\ @@ -427,13 +430,15 @@ HADAMARD_AC( 8, 8 ) * pixel_sad_x4 ****************************************************************************/ #define SAD_X( size ) \ -static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\ +static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = 
x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\ +static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3,\ + intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\ @@ -464,13 +469,15 @@ SAD_X( 8x8_vis ) ****************************************************************************/ #define SATD_X( size, cpu ) \ -static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\ +static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\ +static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3,\ + intptr_t i_stride, int scores[4] )\ {\ scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\ @@ -576,9 +583,9 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 ) /**************************************************************************** * structural similarity metric ****************************************************************************/ -static void ssim_4x4x2_core( const pixel *pix1, int stride1, - const pixel *pix2, int stride2, - int sums[2][4]) +static void ssim_4x4x2_core( const pixel *pix1, intptr_t stride1, + const pixel *pix2, intptr_t stride2, + int sums[2][4] ) { for( int z = 0; z < 2; z++ ) { @@ -640,8 +647,8 @@ static float ssim_end4( int sum0[5][4], int sum1[5][4], int width ) } float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, - pixel *pix1, int stride1, - pixel *pix2, int stride2, + pixel *pix1, intptr_t stride1, + pixel *pix2, intptr_t stride2, int width, int height, void *buf, int *cnt ) { int z = 0; @@ -665,7 +672,7 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, return ssim; } -static int pixel_vsad( pixel *src, int stride, int height ) +static int pixel_vsad( pixel *src, intptr_t stride, int height ) { int score = 0; for( int i = 1; i < height; i++, src += stride ) diff --git a/common/pixel.h b/common/pixel.h index b3935726..50589137 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -30,9 +30,9 @@ // SSD assumes all args aligned // other cmp functions assume first arg aligned -typedef int (*x264_pixel_cmp_t) ( pixel *, int, pixel *, int ); -typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, int, int[3] ); -typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int[4] ); +typedef int (*x264_pixel_cmp_t) ( pixel *, intptr_t, pixel *, intptr_t ); +typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, intptr_t, int[3] ); +typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int[4] ); enum { @@ -88,18 +88,18 @@ typedef struct x264_pixel_cmp_x3_t fpelcmp_x3[7]; 
x264_pixel_cmp_x4_t fpelcmp_x4[7]; x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */ - int (*vsad)( pixel *, int, int ); + int (*vsad)( pixel *, intptr_t, int ); - uint64_t (*var[4])( pixel *pix, int stride ); - int (*var2[4])( pixel *pix1, int stride1, - pixel *pix2, int stride2, int *ssd ); - uint64_t (*hadamard_ac[4])( pixel *pix, int stride ); + uint64_t (*var[4])( pixel *pix, intptr_t stride ); + int (*var2[4])( pixel *pix1, intptr_t stride1, + pixel *pix2, intptr_t stride2, int *ssd ); + uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride ); - void (*ssd_nv12_core)( pixel *pixuv1, int stride1, - pixel *pixuv2, int stride2, int width, int height, + void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); - void (*ssim_4x4x2_core)( const pixel *pix1, int stride1, - const pixel *pix2, int stride2, int sums[2][4] ); + void (*ssim_4x4x2_core)( const pixel *pix1, intptr_t stride1, + const pixel *pix2, intptr_t stride2, int sums[2][4] ); float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width ); /* multiple parallel calls to cmp. */ @@ -143,9 +143,12 @@ typedef struct } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); -void x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ); -uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height ); -float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf, int *cnt ); +void x264_pixel_ssd_nv12 ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height, uint64_t *ssd_u, uint64_t *ssd_v ); +uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height ); +float x264_pixel_ssim_wxh ( x264_pixel_function_t *pf, pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2, + int i_width, int i_height, void *buf, int *cnt ); int x264_field_vsad( x264_t *h, int mb_x, int mb_y ); #endif diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c index a9e862e3..dea872ba 100644 --- a/common/ppc/deblock.c +++ b/common/ppc/deblock.c @@ -267,7 +267,7 @@ static inline vec_u8_t h264_deblock_q1( register vec_u8_t p0, register vec_u8_t q1 = newq1; \ } -void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { if( (tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0 ) { @@ -285,7 +285,7 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, } } -void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) { register vec_u8_t line0, line1, line2, line3, line4, line5; diff --git a/common/ppc/mc.c b/common/ppc/mc.c index 0fc735c2..2e720f47 100644 --- a/common/ppc/mc.c +++ b/common/ppc/mc.c @@ -37,8 +37,8 @@ #include "ppccommon.h" #if !HIGH_BIT_DEPTH -typedef void (*pf_mc_t)( uint8_t *src, int i_src, - uint8_t *dst, int i_dst, int i_height ); +typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, + uint8_t *dst, intptr_t i_dst, int i_height ); 
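/* [Illustrative aside, not part of the patch.] In the hunks below,
 * the "offset" temporaries in mc_luma_altivec and get_ref_altivec
 * switch from int to intptr_t: with a pointer-sized stride the
 * product (mvy>>2)*i_src_stride is computed in 64 bits, and keeping
 * the result pointer-sized avoids truncating it back to 32 bits
 * before the pointer add. A minimal sketch of the pattern
 * (ref_pixel is a made-up name): */
static inline uint8_t *ref_pixel( uint8_t *plane, intptr_t stride, int mvx, int mvy )
{
    intptr_t offset = (mvy >> 2) * stride + (mvx >> 2); /* pointer-sized end to end */
    return plane + offset;
}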
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; @@ -58,8 +58,8 @@ static inline int x264_tapfilter1( uint8_t *pix ) } -static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { for( int y = 0; y < i_height; y++ ) @@ -72,8 +72,8 @@ static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; @@ -95,8 +95,8 @@ static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { vec_u8_t src1v, src2v; @@ -117,8 +117,8 @@ static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst, } } -static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst, - uint8_t *src1, int i_src1, +static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) { x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height); @@ -128,8 +128,8 @@ static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst, /* mc_copy: plain c */ #define MC_COPY( name, a ) \ -static void name( uint8_t *dst, int i_dst, \ - uint8_t *src, int i_src, int i_height ) \ +static void name( uint8_t *dst, intptr_t i_dst, \ + uint8_t *src, intptr_t i_src, int i_height ) \ { \ int y; \ for( y = 0; y < i_height; y++ ) \ @@ -142,14 +142,14 @@ static void name( uint8_t *dst, int i_dst, \ MC_COPY( x264_mc_copy_w4_altivec, 4 ) MC_COPY( x264_mc_copy_w8_altivec, 8 ) -static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int i_height ) +static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height ) { vec_u8_t cpyV; PREP_LOAD; PREP_LOAD_SRC( src ); - for( int y = 0; y < i_height; y++) + for( int y = 0; y < i_height; y++ ) { VEC_LOAD( src, cpyV, 16, vec_u8_t, src ); vec_st(cpyV, 0, dst); @@ -160,12 +160,12 @@ static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst, } -static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst, - uint8_t *src, int i_src, int i_height ) +static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst, + uint8_t *src, intptr_t i_src, int i_height ) { - for( int y = 0; y < i_height; ++y) + for( int y = 0; y < i_height; ++y ) { - vec_u8_t cpyV = vec_ld( 0, src); + vec_u8_t cpyV = vec_ld( 0, src ); vec_st(cpyV, 0, dst); src += i_src; @@ -174,13 +174,13 @@ static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, int i_dst, } -static void mc_luma_altivec( uint8_t *dst, int i_dst_stride, - uint8_t *src[4], int i_src_stride, +static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = 
src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { @@ -222,13 +222,13 @@ static void mc_luma_altivec( uint8_t *dst, int i_dst_stride, -static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride, - uint8_t *src[4], int i_src_stride, +static uint8_t *get_ref_altivec( uint8_t *dst, intptr_t *i_dst_stride, + uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); - int offset = (mvy>>2)*i_src_stride + (mvx>>2); + intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { @@ -266,10 +266,9 @@ static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride, } } -static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx&0x07; @@ -297,10 +296,9 @@ static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, } } -static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; @@ -386,10 +384,9 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, int i_dst_strid } } -static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_height ) +static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_height ) { uint8_t *srcp; int d8x = mvx & 0x07; @@ -510,10 +507,9 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, int i_dst_strid } } -static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, - uint8_t *src, int i_src_stride, - int mvx, int mvy, - int i_width, int i_height ) +static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, + uint8_t *src, intptr_t i_src_stride, + int mvx, int mvy, int i_width, int i_height ) { if( i_width == 8 ) mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride, @@ -670,7 +666,7 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, int i_dst_stride, } void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, - int i_stride, int i_width, int i_height, int16_t *buf ) + intptr_t i_stride, int i_width, int i_height, int16_t *buf ) { vec_u8_t destv; vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v; @@ -765,7 +761,7 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint } static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, - int src_stride, int dst_stride, int width, int height ) + intptr_t src_stride, intptr_t dst_stride, int width, int height ) { int w = width >> 4; int end = (width & 15); @@ -857,7 +853,7 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_ } } -static void 
mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -911,7 +907,7 @@ static void mc_weight_w2_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -965,7 +961,7 @@ static void mc_weight_w4_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -1020,7 +1016,7 @@ static void mc_weight_w8_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_s } } } -static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; @@ -1080,7 +1076,7 @@ static void mc_weight_w16_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_ } } } -static void mc_weight_w20_altivec( uint8_t *dst, int i_dst, uint8_t *src, int i_src, +static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src, const x264_weight_t *weight, int i_height ) { LOAD_ZERO; diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index b60bfaf0..585bc197 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -34,8 +34,8 @@ **********************************************************************/ #define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b ) \ -static int name( uint8_t *pix1, int i_pix1, \ - uint8_t *pix2, int i_pix2 ) \ +static int name( uint8_t *pix1, intptr_t i_pix1, \ + uint8_t *pix2, intptr_t i_pix2 ) \ { \ ALIGNED_16( int sum ); \ \ @@ -119,8 +119,8 @@ PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec, 8, 8, 2s, 1 ) /*********************************************************************** * SATD 4x4 **********************************************************************/ -static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -164,8 +164,8 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 4x8 **********************************************************************/ -static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -218,8 +218,8 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 8x4 **********************************************************************/ -static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -272,8 +272,8 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1, 
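/* [Editor's sketch - illustrative, not part of the patch; helper names are
 * hypothetical.] Why "offset" is widened along with the strides in the
 * mc_luma_altivec/get_ref_altivec hunks above: once i_src_stride is
 * intptr_t, the (mvy>>2)*i_src_stride product is computed at pointer
 * width, and funnelling it through a 32-bit int would truncate it again
 * before the pointer addition. */
#include <stdint.h>

static uint8_t *mv_to_pixel( uint8_t *plane, intptr_t stride, int mvx, int mvy )
{
    /* full-width arithmetic end to end; with "int offset" the
     * (mvy>>2)*stride product would be squeezed through 32 bits
     * on its way into the pointer addition */
    intptr_t offset = (mvy >> 2) * stride + (mvx >> 2);
    return plane + offset;
}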
/*********************************************************************** * SATD 8x8 **********************************************************************/ -static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -332,8 +332,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 8x16 **********************************************************************/ -static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -416,8 +416,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 16x8 **********************************************************************/ -static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -500,8 +500,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1, /*********************************************************************** * SATD 16x16 **********************************************************************/ -static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { ALIGNED_16( int i_satd ); @@ -632,7 +632,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1, static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -744,7 +744,7 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc, static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -834,7 +834,8 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0, scores[2] = sum2; } -static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] ) +static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, + uint8_t *pix3, intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -945,7 +946,7 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1038,7 +1039,7 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, static void pixel_sad_x4_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1152,7 +1153,7 @@ static void pixel_sad_x4_8x16_altivec( 
uint8_t *fenc, static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1247,7 +1248,7 @@ static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0, static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, - int i_stride, int scores[4] ) + intptr_t i_stride, int scores[4] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1361,7 +1362,7 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc, static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, - int i_stride, int scores[3] ) + intptr_t i_stride, int scores[3] ) { ALIGNED_16( int sum0 ); ALIGNED_16( int sum1 ); @@ -1457,8 +1458,8 @@ static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0, * SSD routines **********************************************************************/ -static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1, - uint8_t *pix2, int i_stride_pix2) +static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1, + uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); @@ -1536,8 +1537,8 @@ static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1, return sum; } -static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, - uint8_t *pix2, int i_stride_pix2) +static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1, + uint8_t *pix2, intptr_t i_stride_pix2 ) { ALIGNED_16( int sum ); @@ -1588,7 +1589,7 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1, /**************************************************************************** * variance ****************************************************************************/ -static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1615,7 +1616,7 @@ static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride ) return sum + ((uint64_t)sqr<<32); } -static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) +static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride ) { ALIGNED_16(uint32_t sum_tab[4]); ALIGNED_16(uint32_t sqr_tab[4]); @@ -1713,8 +1714,8 @@ static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride ) sa8d7v = vec_sub(b6v, b7v); \ } -static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd=0; @@ -1781,21 +1782,21 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, return i_satd; } -static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2; return i_satd; } -static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, - uint8_t *pix2, int i_pix2 ) +static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1, + uint8_t *pix2, intptr_t i_pix2 ) { int32_t i_satd; - i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 ) - + 
pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 ) + i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0], i_pix1, &pix2[0], i_pix2 ) + + pixel_sa8d_8x8_core_altivec( &pix1[8], i_pix1, &pix2[8], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 ) + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2; return i_satd; @@ -1817,7 +1818,7 @@ static int pixel_sa8d_16x16_altivec( uint8_t *pix1, int i_pix1, vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm); \ vec_s16_t pix16_d##num; -static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm ) +static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm ) { ALIGNED_16( int32_t sum4_tab[4] ); ALIGNED_16( int32_t sum8_tab[4] ); @@ -1903,7 +1904,7 @@ static const vec_u8_t hadamard_permtab[] = 0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F ) }; -static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; @@ -1915,7 +1916,7 @@ static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride ) { int idx = ((uintptr_t)pix & 8) >> 3; vec_u8_t permh = hadamard_permtab[idx]; @@ -1925,7 +1926,7 @@ static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); @@ -1933,7 +1934,7 @@ static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, int stride ) return ((sum>>34)<<32) + ((uint32_t)sum>>1); } -static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride ) +static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride ) { vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ]; uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm ); @@ -1944,8 +1945,8 @@ static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, int stride ) /**************************************************************************** * structural similarity metric ****************************************************************************/ -static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1, - const uint8_t *pix2, int stride2, +static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1, + const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) { ALIGNED_16( int temp[4] ); @@ -1986,13 +1987,15 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1, } #define SATD_X( size ) \ -static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\ +static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ + intptr_t i_stride, int scores[3] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = 
pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\ }\ -static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\ +static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\ + uint8_t *pix3, intptr_t i_stride, int scores[4] )\ {\ scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\ scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\ diff --git a/common/sparc/pixel.h b/common/sparc/pixel.h index 32498db4..0c762d4a 100644 --- a/common/sparc/pixel.h +++ b/common/sparc/pixel.h @@ -26,9 +26,9 @@ #ifndef X264_SPARC_PIXEL_H #define X264_SPARC_PIXEL_H -int x264_pixel_sad_8x8_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_8x16_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x8_vis( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x16_vis( uint8_t *, int, uint8_t *, int ); +int x264_pixel_sad_8x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_8x16_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_16x8_vis ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sad_16x16_vis( uint8_t *, intptr_t, uint8_t *, intptr_t ); #endif diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 7622eb69..e7c5dc31 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -160,7 +160,7 @@ cextern pw_pixel_max %macro DEBLOCK_LUMA 0 ;----------------------------------------------------------------------------- -; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma, 5,5,8 %assign pad 5*mmsize+12-(stack_offset&15) @@ -603,7 +603,7 @@ DEBLOCK_LUMA_64 %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA_INTRA_64 0 cglobal deblock_v_luma_intra, 4,7,16 @@ -653,7 +653,7 @@ cglobal deblock_v_luma_intra, 4,7,16 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,16 %define t0 m15 @@ -722,7 +722,7 @@ DEBLOCK_LUMA_INTRA_64 %macro DEBLOCK_LUMA_INTRA 0 ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_luma_intra, 4,7,8 LUMA_INTRA_INIT 3 @@ -748,7 +748,7 @@ cglobal deblock_v_luma_intra, 4,7,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint16_t *pix, int stride, int 
alpha, int beta ) +; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,7,8 LUMA_INTRA_INIT 8 @@ -1090,7 +1090,7 @@ DEBLOCK_LUMA_INTRA %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_LUMA 0 cglobal deblock_v_luma, 5,5,10 @@ -1135,12 +1135,11 @@ cglobal deblock_v_luma, 5,5,10 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname cglobal deblock_h_luma, 5,9 - movsxd r7, r1d - lea r8, [r7*3] + lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] %if WIN64 @@ -1152,14 +1151,15 @@ cglobal deblock_h_luma, 5,9 %endif ; transpose 6x16 -> tmp space - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp + lea r6, [r6+r1*8] + lea r5, [r5+r1*8] + TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8 ; vertical filter ; alpha, beta, tc0 are still in r2d, r3d, r4 ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them + mov r7, r1 lea r0, [pix_tmp+0x30] mov r1d, 0x10 %if WIN64 @@ -1203,7 +1203,7 @@ DEBLOCK_LUMA %macro DEBLOCK_LUMA 2 ;----------------------------------------------------------------------------- -; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma, 5,5 lea r4, [r1*3] @@ -1255,7 +1255,7 @@ cglobal deblock_%1_luma, 5,5 RET ;----------------------------------------------------------------------------- -; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- INIT_MMX cpuname cglobal deblock_h_luma, 0,5 @@ -1452,7 +1452,7 @@ DEBLOCK_LUMA v, 16 %endif ;----------------------------------------------------------------------------- -; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_%1_luma_intra, 4,6,16 %if ARCH_X86_64 == 0 @@ -1514,24 +1514,24 @@ cglobal deblock_%1_luma_intra, 4,6,16 INIT_MMX cpuname %if ARCH_X86_64 ;----------------------------------------------------------------------------- -; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_luma_intra, 4,9 - movsxd r7, 
r1d - lea r8, [r7*3] + lea r8, [r1*3] lea r6, [r0-4] lea r5, [r0-4+r8] sub rsp, 0x88 %define pix_tmp rsp ; transpose 8x16 -> tmp space - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) - lea r6, [r6+r7*8] - lea r5, [r5+r7*8] - TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) - - lea r0, [pix_tmp+0x40] - mov r1, 0x10 + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) + lea r6, [r6+r1*8] + lea r5, [r5+r1*8] + TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) + + mov r7, r1 + lea r0, [pix_tmp+0x40] + mov r1, 0x10 call deblock_v_luma_intra ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) @@ -1685,9 +1685,9 @@ cglobal deblock_inter_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- -cglobal deblock_v_chroma, 7,7,8 +cglobal deblock_v_chroma, 5,7,8 FIX_STRIDES r1 mov r5, r0 sub r0, r1 @@ -1705,7 +1705,7 @@ cglobal deblock_v_chroma, 7,7,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 add r1, r1 @@ -1731,7 +1731,7 @@ cglobal deblock_intra_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,6,8 add r1, r1 @@ -1752,7 +1752,7 @@ cglobal deblock_v_chroma_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 add r1, r1 @@ -1770,7 +1770,7 @@ cglobal deblock_h_chroma_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra_mbaff, 4,6,8 add r1, r1 @@ -1793,7 +1793,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_mbaff, 5,7,8 add r1, r1 @@ -1821,7 +1821,7 
@@ cglobal deblock_h_chroma_mbaff, 5,7,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422_intra, 4,6,8 add r1, r1 @@ -1839,7 +1839,7 @@ cglobal deblock_h_chroma_422_intra, 4,6,8 REP_RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_422, 5,7,8 add r1, r1 @@ -1940,7 +1940,7 @@ cglobal chroma_inter_body ret ;----------------------------------------------------------------------------- -; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma, 5,6,8 CHROMA_V_START @@ -1955,7 +1955,7 @@ cglobal deblock_v_chroma, 5,6,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma, 5,7,8 CHROMA_H_START @@ -1980,7 +1980,7 @@ DEBLOCK_CHROMA %endif ;----------------------------------------------------------------------------- -; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) +; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 @@ -2076,7 +2076,7 @@ cglobal chroma_intra_body %macro DEBLOCK_CHROMA_INTRA 0 ;----------------------------------------------------------------------------- -; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_v_chroma_intra, 4,5,8 CHROMA_V_START @@ -2091,7 +2091,7 @@ cglobal deblock_v_chroma_intra, 4,5,8 RET ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- cglobal deblock_h_chroma_intra, 4,6,8 CHROMA_H_START @@ -2132,7 +2132,7 @@ DEBLOCK_CHROMA_INTRA %endif ;----------------------------------------------------------------------------- -; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta ) +; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal 
deblock_h_chroma_intra_mbaff, 4,6,8 diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 689999ee..923a2cd3 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -61,8 +61,7 @@ cextern pd_32 %if WIN64 DECLARE_REG_TMP 0,1,2,3,4,5,4,5 %macro AVG_START 0-1 0 - PROLOGUE 5,7,%1 - movsxd r5, dword r5m + PROLOGUE 6,7,%1 %endmacro %elif UNIX64 DECLARE_REG_TMP 0,1,2,3,4,5,7,8 @@ -190,7 +189,7 @@ cextern pd_32 %endif ;HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight ) +; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight ) ;----------------------------------------------------------------------------- %macro AVG_WEIGHT 1-2 0 cglobal pixel_avg_weight_w%1 @@ -403,7 +402,7 @@ AVG_WEIGHT 16, 7 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h ) +;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h ) ;----------------------------------------------------------------------------- %macro WEIGHTER 1 @@ -479,7 +478,7 @@ WEIGHTER 20 %endmacro ;----------------------------------------------------------------------------- -;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h ) +;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h ) ;----------------------------------------------------------------------------- %macro OFFSET 2 cglobal mc_offset%2_w%1, 6,6 @@ -524,8 +523,8 @@ OFFSETPN 8 ;============================================================================= ;----------------------------------------------------------------------------- -; void pixel_avg_4x4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight ); +; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int weight ); ;----------------------------------------------------------------------------- %macro AVGH 2 cglobal pixel_avg_%1x%2 @@ -540,9 +539,8 @@ cglobal pixel_avg_%1x%2 %endmacro ;----------------------------------------------------------------------------- -; void pixel_avg_w4( pixel *dst, int dst_stride, -; pixel *src1, int src1_stride, pixel *src2, int src2_stride, -; int height, int weight ); +; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, +; pixel *src2, intptr_t src2_stride, int height, int weight ); ;----------------------------------------------------------------------------- %macro AVG_FUNC 3 @@ -648,8 +646,8 @@ AVGH 4, 2 %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void pixel_avg2_wN( uint16_t *dst, int dst_stride, -; uint16_t *src1, int src_stride, +; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride, +; uint16_t *src1, intptr_t src_stride, ; uint16_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W_ONE 1 @@ -832,8 +830,8 @@ cglobal pixel_avg2_w18_sse2, 6,7,6 %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void pixel_avg2_w4( uint8_t *dst, int dst_stride, -; uint8_t 
*src1, int src_stride, +; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride, +; uint8_t *src1, intptr_t src_stride, ; uint8_t *src2, int height ); ;----------------------------------------------------------------------------- %macro AVG2_W8 2 @@ -1194,8 +1192,8 @@ AVG16_CACHELINE_LOOP_SSSE3 j, k %endmacro ;----------------------------------------------------------------------------- -; void mc_copy_w4( uint8_t *dst, int i_dst_stride, -; uint8_t *src, int i_src_stride, int i_height ) +; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride, +; uint8_t *src, intptr_t i_src_stride, int i_height ) ;----------------------------------------------------------------------------- INIT_MMX cglobal mc_copy_w4_mmx, 4,6 @@ -1250,14 +1248,14 @@ MC_COPY 16 ; FIXME doesn't cover all pixels in high depth and/or 4:4:4 ;----------------------------------------------------------------------------- -; void prefetch_fenc( pixel *pix_y, int stride_y, -; pixel *pix_uv, int stride_uv, int mb_x ) +; void prefetch_fenc( pixel *pix_y, intptr_t stride_y, +; pixel *pix_uv, intptr_t stride_uv, int mb_x ) ;----------------------------------------------------------------------------- %macro PREFETCH_FENC 1 %if ARCH_X86_64 cglobal prefetch_fenc_%1, 5,5 - FIX_STRIDES r1d, r3d + FIX_STRIDES r1, r3 and r4d, 3 mov eax, r4d imul r4d, r1d @@ -1317,11 +1315,11 @@ PREFETCH_FENC 420 PREFETCH_FENC 422 ;----------------------------------------------------------------------------- -; void prefetch_ref( pixel *pix, int stride, int parity ) +; void prefetch_ref( pixel *pix, intptr_t stride, int parity ) ;----------------------------------------------------------------------------- INIT_MMX mmx2 cglobal prefetch_ref, 3,3 - FIX_STRIDES r1d + FIX_STRIDES r1 dec r2d and r2d, r1d lea r0, [r0+r2*8+64*SIZEOF_PIXEL] @@ -1397,8 +1395,8 @@ cglobal prefetch_ref, 3,3 %endif ; HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride, -; uint8_t *src, int src_stride, +; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride, +; uint8_t *src, intptr_t src_stride, ; int dx, int dy, ; int width, int height ) ;----------------------------------------------------------------------------- diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 0c15a416..12bec5b2 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -155,14 +155,11 @@ cextern pd_ffff %if HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_FILTER 0 cglobal hpel_filter_v, 5,6,11 - FIX_STRIDES r3d, r4d -%if WIN64 - movsxd r4, r4d -%endif + FIX_STRIDES r3, r4 lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -179,7 +176,7 @@ cglobal hpel_filter_v, 5,6,11 %define s30 [pad30] %endif add r0, r4 - lea r2, [r2+r4] + add r2, r4 neg r4 mova m7, [pw_pixel_max] pxor m0, m0 @@ -216,12 +213,12 @@ cglobal hpel_filter_v, 5,6,11 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,10 add r2, r2 
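; [Editor's note - illustrative asm commentary, not part of the patch.]
; "add r2, r2" above doubles the width argument for 16-bit pixels as a
; plain 64-bit ALU op; that is only safe because width is now declared
; intptr_t, so the caller materializes the full register. With the old
; "int width" prototype, the defensive pattern removed elsewhere in this
; diff would be needed first (hypothetical registers):
;
;   movsxd r2, r2d      ; int arg: only the low 32 bits are trustworthy
;   add    r2, r2       ; ...then 64-bit math is safe
;
; Dropping such movsxd fix-ups is the recurring simplification in the
; asm hunks here.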
add r0, r2 - lea r1, [r1+r2] + add r1, r2 neg r2 mova m0, [tap1] mova m7, [tap3] @@ -265,7 +262,7 @@ cglobal hpel_filter_c, 3,3,10 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width ); +; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h, 3,4,8 %define src r1+r2 @@ -317,12 +314,9 @@ HPEL_FILTER %if HIGH_BIT_DEPTH == 0 %macro HPEL_V 1 ;----------------------------------------------------------------------------- -; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width ); +; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_v, 5,6,%1 -%if WIN64 - movsxd r4, r4d -%endif lea r5, [r1+r3] sub r1, r3 sub r1, r3 @@ -375,7 +369,7 @@ cglobal hpel_filter_v, 5,6,%1 %endmacro ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- INIT_MMX cglobal hpel_filter_c_mmx2, 3,3 @@ -405,7 +399,7 @@ cglobal hpel_filter_c_mmx2, 3,3 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h_mmx2, 3,3 add r0, r2 @@ -452,7 +446,7 @@ INIT_XMM %macro HPEL_C 0 ;----------------------------------------------------------------------------- -; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_c, 3,3,9 add r0, r2 @@ -520,7 +514,7 @@ cglobal hpel_filter_c, 3,3,9 %endmacro ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- cglobal hpel_filter_h_sse2, 3,3,8 add r0, r2 @@ -568,7 +562,7 @@ cglobal hpel_filter_h_sse2, 3,3,8 REP_RET ;----------------------------------------------------------------------------- -; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width ); ;----------------------------------------------------------------------------- %macro HPEL_H 0 cglobal hpel_filter_h, 3,3 @@ -739,15 +733,11 @@ HPEL_H %macro HPEL 0 ;----------------------------------------------------------------------------- ; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; uint8_t *src, int stride, int width, int height) +; uint8_t *src, intptr_t stride, int width, int height ) ;----------------------------------------------------------------------------- cglobal hpel_filter, 7,9,16 -%if WIN64 - movsxd r4, r4d - movsxd r5, r5d -%endif mov r7, r3 - sub r5, 16 + sub r5d, 16 mov r8, r1 and r7, 15 sub r3, r7 @@ -815,21 +805,20 @@ HPEL %endif ; !HIGH_BIT_DEPTH 
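; [Editor's aside - illustrative, not part of the patch.] The
; "sub r5, 16" -> "sub r5d, 16" change in the hpel_filter prologue above
; shows the complementary x86_64 rule this cleanup leans on: writing any
; 32-bit register zero-extends the result into the full 64-bit register.
; A 32-bit ALU op is therefore a cheap way to make the upper half
; well-defined locally, instead of trusting the caller to have done it
; (hypothetical operands, assuming width >= 16):
;
;   mov  r5d, dword r5m  ; 32-bit load: upper half of r5 is cleared
;   sub  r5d, 16         ; still 32-bit: r5 remains fully defined
;   lea  r0, [r0+r5]     ; 64-bit addressing on r5 is now safe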
;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, int i_dst, -; pixel *src, int i_src, int w, int h) +; void plane_copy_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>w INIT_MMX cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1d, r3d, r4d - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d + FIX_STRIDES r1, r3, r4d +%if HIGH_BIT_DEPTH == 0 movsxdifnidn r4, r4d +%endif sub r1, r4 sub r3, r4 .loopy: - mov r6d, r4d - sub r6d, 63 + lea r6d, [r4-63] .loopx: prefetchnta [r2+256] movq m0, [r2 ] @@ -958,22 +947,19 @@ cglobal plane_copy_core_mmx2, 6,7 %macro PLANE_INTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_interleave_core( uint8_t *dst, int i_dst, -; uint8_t *srcu, int i_srcu, -; uint8_t *srcv, int i_srcv, int w, int h ) +; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst, +; uint8_t *srcu, intptr_t i_srcu, +; uint8_t *srcv, intptr_t i_srcv, int w, int h ) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>2*w -cglobal plane_copy_interleave_core, 7,9 - FIX_STRIDES r1d, r3d, r5d, r6d +cglobal plane_copy_interleave_core, 6,9 + mov r6d, r6m %if HIGH_BIT_DEPTH - mov r1m, r1d - mov r3m, r3d - mov r6m, r6d + FIX_STRIDES r1, r3, r5, r6d + movifnidn r1mp, r1 + movifnidn r3mp, r3 + mov r6m, r6d %endif - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d - movsxdifnidn r5, r5d - movsxdifnidn r6, r6d lea r0, [r0+r6*2] add r2, r6 add r4, r6 @@ -1028,10 +1014,10 @@ cglobal plane_copy_interleave_core, 7,9 RET ;----------------------------------------------------------------------------- -; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height ) +; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height ) ;----------------------------------------------------------------------------- cglobal store_interleave_chroma, 5,5 - FIX_STRIDES r1d + FIX_STRIDES r1 .loop: INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a @@ -1055,20 +1041,17 @@ cglobal store_interleave_chroma, 5,5 %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_deinterleave( pixel *dstu, int i_dstu, -; pixel *dstv, int i_dstv, -; pixel *src, int i_src, int w, int h ) +; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, +; pixel *dstv, intptr_t i_dstv, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- cglobal plane_copy_deinterleave, 6,7 DEINTERLEAVE_START mov r6d, r6m - FIX_STRIDES r1d, r3d, r5d, r6d + FIX_STRIDES r1, r3, r5, r6d %if HIGH_BIT_DEPTH mov r6m, r6d %endif - movsxdifnidn r1, r1d - movsxdifnidn r3, r3d - movsxdifnidn r5, r5d add r0, r6 add r2, r6 lea r4, [r4+r6*2] @@ -1088,11 +1071,11 @@ cglobal plane_copy_deinterleave, 6,7 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal 
load_deinterleave_chroma_fenc, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a @@ -1103,11 +1086,11 @@ cglobal load_deinterleave_chroma_fenc, 4,4 REP_RET ;----------------------------------------------------------------------------- -; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height ) +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height ) ;----------------------------------------------------------------------------- cglobal load_deinterleave_chroma_fdec, 4,4 DEINTERLEAVE_START - FIX_STRIDES r2d + FIX_STRIDES r2 .loop: DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a @@ -1236,7 +1219,7 @@ MEMZERO %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- -; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride ) +; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) ;----------------------------------------------------------------------------- INIT_XMM cglobal integral_init4h_sse4, 3,4 @@ -1291,7 +1274,7 @@ INTEGRAL_INIT8H %macro INTEGRAL_INIT_8V 0 ;----------------------------------------------------------------------------- -; void integral_init8v( uint16_t *sum8, int stride ) +; void integral_init8v( uint16_t *sum8, intptr_t stride ) ;----------------------------------------------------------------------------- cglobal integral_init8v, 3,3 shl r1, 1 @@ -1316,7 +1299,7 @@ INIT_XMM sse2 INTEGRAL_INIT_8V ;----------------------------------------------------------------------------- -; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride ) +; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride ) ;----------------------------------------------------------------------------- INIT_MMX cglobal integral_init4v_mmx, 3,5 @@ -1505,17 +1488,14 @@ cglobal integral_init4v_ssse3, 3,5 ;----------------------------------------------------------------------------- ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, -; int src_stride, int dst_stride, int width, int height ) +; intptr_t src_stride, intptr_t dst_stride, int width, int height ) ;----------------------------------------------------------------------------- %macro FRAME_INIT_LOWRES 0 cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise %if HIGH_BIT_DEPTH shl dword r6m, 1 - FIX_STRIDES r5d + FIX_STRIDES r5 shl dword r7m, 1 -%endif -%if WIN64 - movsxd r5, r5d %endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index 435f6bd9..8e587536 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -37,22 +37,22 @@ void func##_sse2 args;\ void func##_ssse3 args; -DECL_SUF( x264_pixel_avg_16x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_16x8, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x8, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_8x4, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x16, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x8, ( pixel *, int, pixel *, int, pixel *, int, 
int )) -DECL_SUF( x264_pixel_avg_4x4, ( pixel *, int, pixel *, int, pixel *, int, int )) -DECL_SUF( x264_pixel_avg_4x2, ( pixel *, int, pixel *, int, pixel *, int, int )) +DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) +DECL_SUF( x264_pixel_avg_4x2, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) #define MC_WEIGHT(w,type) \ - void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); + void x264_mc_weight_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); #define MC_WEIGHT_OFFSET(w,type) \ - void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \ - void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \ + void x264_mc_offsetadd_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ + void x264_mc_offsetsub_w##w##_##type( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); \ MC_WEIGHT(w,type) MC_WEIGHT_OFFSET( 4, mmx2 ) @@ -75,74 +75,73 @@ MC_WEIGHT( 20, ssse3 ) #undef MC_OFFSET #undef MC_WEIGHT -void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int ); -void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int ); -void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int ); -void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int ); -void x264_prefetch_ref_mmx2( pixel *, int, int ); -void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h); -void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h ); -void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_interleave_c( pixel *dst, int i_dst, - pixel *srcu, int i_srcu, - pixel *srcv, int i_srcv, int w, int h ); -void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu, - pixel *dstv, int i_dstv, - pixel *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu, - pixel *dstv, int i_dstv, - pixel *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu, - uint8_t *dstv, int i_dstv, - uint8_t *src, int i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx( 
uint16_t *dstu, int i_dstu, - uint16_t *dstv, int i_dstv, - uint16_t *src, int i_src, int w, int h ); -void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height ); -void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height ); -void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height ); -void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n ); -void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n ); -void x264_memzero_aligned_mmx( void * dst, int n ); -void x264_memzero_aligned_sse2( void * dst, int n ); -void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride ); -void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride ); -void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride ); -void x264_integral_init8v_mmx( uint16_t *sum8, int stride ); -void x264_integral_init8v_sse2( uint16_t *sum8, int stride ); -void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride ); +void x264_mc_copy_w4_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_mmx ( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_mc_copy_w16_aligned_sse2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); +void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); +void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_deinterleave_mmx( pixel *dstu, intptr_t i_dstu, + pixel 
*dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu, + uint8_t *dstv, intptr_t i_dstv, + uint8_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint16_t *src, intptr_t i_src, int w, int h ); +void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fenc_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_mmx ( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); +void *x264_memcpy_aligned_mmx ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_mmx ( void *dst, size_t n ); +void x264_memzero_aligned_sse2( void *dst, size_t n ); +void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, intptr_t stride ); +void x264_integral_init4v_mmx ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_sse2 ( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, intptr_t stride ); +void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); +void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ -void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\ - pixel *src, int i_src,\ +void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ int dx, int dy, int i_width, int i_height ); MC_CHROMA(mmx2) 

 #define MC_CHROMA(cpu)\
-void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
-                           pixel *src, int i_src,\
+void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
                            int dx, int dy, int i_width, int i_height );
 MC_CHROMA(mmx2)
 MC_CHROMA(sse2)
@@ -154,7 +153,7 @@ MC_CHROMA(avx_cache64)

 #define LOWRES(cpu)\
 void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
-                                        int src_stride, int dst_stride, int width, int height );
+                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
 LOWRES(mmx2)
 LOWRES(cache32_mmx2)
 LOWRES(sse2)
@@ -163,7 +162,7 @@ LOWRES(avx)
 LOWRES(xop)

 #define PIXEL_AVG_W(width,cpu)\
-void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
+void x264_pixel_avg2_w##width##_##cpu( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t );
 /* This declares some functions that don't exist, but that isn't a problem. */
 #define PIXEL_AVG_WALL(cpu)\
 PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
@@ -177,7 +176,7 @@ PIXEL_AVG_WALL(sse2_misalign)
 PIXEL_AVG_WALL(cache64_ssse3)

 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
-static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
+static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t ) =\
 {\
     NULL,\
     x264_pixel_avg2_w4_##name1,\
@@ -216,7 +215,7 @@ PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3,
 #endif // HIGH_BIT_DEPTH

 #define MC_COPY_WTAB(instr, name1, name2, name3)\
-static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
+static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =\
 {\
     NULL,\
     x264_mc_copy_w4_##name1,\
@@ -233,7 +232,7 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
 #endif

 #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
-    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
+    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
 {\
     x264_mc_##function##_w4_##name1,\
     x264_mc_##function##_w4_##name1,\
@@ -332,10 +331,10 @@ static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};

 #define MC_LUMA(name,instr1,instr2)\
-static void mc_luma_##name( pixel *dst,    int i_dst_stride,\
-                            pixel *src[4], int i_src_stride,\
-                            int mvx, int mvy,\
-                            int i_width, int i_height, const x264_weight_t *weight )\
+static void mc_luma_##name( pixel *dst,    intptr_t i_dst_stride,\
+                            pixel *src[4], intptr_t i_src_stride,\
+                            int mvx, int mvy,\
+                            int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -367,10 +366,10 @@ MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
 #endif // !HIGH_BIT_DEPTH

 #define GET_REF(name)\
-static pixel *get_ref_##name( pixel *dst,    int *i_dst_stride,\
-                              pixel *src[4], int i_src_stride,\
-                              int mvx, int mvy,\
-                              int i_width, int i_height, const x264_weight_t *weight )\
+static pixel *get_ref_##name( pixel *dst,    intptr_t *i_dst_stride,\
+                              pixel *src[4], intptr_t i_src_stride,\
+                              int mvx, int mvy,\
+                              int i_width, int i_height, const x264_weight_t *weight )\
 {\
     int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
     int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
@@ -410,13 +409,13 @@ GET_REF(cache64_ssse3)
 #endif // !HIGH_BIT_DEPTH

 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
-void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
-void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
-void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
+void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, intptr_t stride, intptr_t width);\
+void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, intptr_t width );\
+void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, intptr_t width );\
 static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
-                                    int stride, int width, int height, int16_t *buf )\
+                                    intptr_t stride, int width, int height, int16_t *buf )\
 {\
-    int realign = (intptr_t)src & (align-1);\
+    intptr_t realign = (intptr_t)src & (align-1);\
     src -= realign;\
     dstv -= realign;\
     dstc -= realign;\
@@ -441,9 +440,9 @@ HPEL(16, sse2, sse2, sse2, sse2)
 #else // !HIGH_BIT_DEPTH
 HPEL(16, sse2_amd, mmx2, mmx2, sse2)
 #if ARCH_X86_64
-void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
-void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
 #else
 HPEL(16, sse2, sse2, sse2, sse2)
 HPEL(16, ssse3, ssse3, ssse3, ssse3)
@@ -452,7 +451,7 @@ HPEL(16, avx, avx, avx, avx)
 HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
 #endif // HIGH_BIT_DEPTH

-static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
+static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )
 {
     int c_w = 16/sizeof(pixel) - 1;
     if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
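The HPEL wrapper a few hunks above gets the same treatment for realign: the value is derived from a pointer, so it stays in intptr_t rather than being bounced through int. The pattern in isolation (an illustrative sketch, not patch code):

    #include <stdint.h>

    /* Round a pointer down to an alignment boundary (align a power of two). */
    static inline uint8_t *align_down( uint8_t *p, intptr_t align )
    {
        intptr_t misalign = (intptr_t)p & (align - 1);  /* 0 .. align-1 */
        return p - misalign;
    }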
@@ -470,9 +469,9 @@ static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src,
 }

 #define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( pixel *dst,  int i_dst,\
-                                              pixel *srcu, int i_srcu,\
-                                              pixel *srcv, int i_srcv, int w, int h )\
+static void x264_plane_copy_interleave_##cpu( pixel *dst,  intptr_t i_dst,\
+                                              pixel *srcu, intptr_t i_srcu,\
+                                              pixel *srcv, intptr_t i_srcv, int w, int h )\
 {\
     if( !(w&15) ) {\
         x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm
index b3dcdbab..77a87421 100644
--- a/common/x86/pixel-32.asm
+++ b/common/x86/pixel-32.asm
@@ -67,7 +67,7 @@ INIT_MMX mmx2
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
     push   r0
@@ -362,8 +362,8 @@ cglobal intra_sa8d_x3_8x8, 2,3

 ;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-;                             const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
 cglobal pixel_ssim_4x4x2_core, 0,5
     mov       r1, r1m
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6f3076cf..06737ab1 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -144,7 +144,7 @@ cextern hsub_mul

 %if HIGH_BIT_DEPTH
 ;-----------------------------------------------------------------------------
-; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
+; int pixel_ssd_MxN( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SSD_ONE 2
 cglobal pixel_ssd_%1x%2, 4,5,6
@@ -361,7 +361,7 @@ SSD_ONE    16, 16
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_ssd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_ssd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SSD 2
 %if %1 != %2
@@ -466,7 +466,7 @@ SSD  8,  4
 %endif ; !HIGH_BIT_DEPTH

 ;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint16_t *pixuv1, int stride1, uint16_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
 ;
 ; The maximum width this function can handle without risk of overflow is given
@@ -560,7 +560,7 @@ cglobal pixel_ssd_nv12_core, 6,7,7

 %if HIGH_BIT_DEPTH == 0
 ;-----------------------------------------------------------------------------
-; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
+; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
 ;                           int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
 ;
 ; This implementation can potentially overflow on image widths >= 11008 (or
@@ -697,7 +697,7 @@ SSD_NV12
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_var_wxh( uint8_t *, int )
+; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal pixel_var_16x16, 2,3
@@ -820,7 +820,7 @@ VAR
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
+; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
 ;-----------------------------------------------------------------------------
 %macro VAR2_8x8_MMX 2
 cglobal pixel_var2_8x%1, 5,6
@@ -1128,7 +1128,7 @@ VAR2_8x8_SSSE3 16, 7
 ; for small blocks on x86_32, modify pixel pointer instead.

 ;-----------------------------------------------------------------------------
-; int pixel_satd_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
 cglobal pixel_satd_16x4_internal
@@ -1335,7 +1335,7 @@ cglobal pixel_satd_4x4, 4,6
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
+; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SATDS_SSE2 0
 %if cpuflag(ssse3)
@@ -1476,7 +1476,7 @@ cglobal pixel_satd_8x4, 4,6,8

 %if ARCH_X86_64
 ;-----------------------------------------------------------------------------
-; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sa8d_8x8_internal
     lea  r6, [r0+4*r1]
@@ -3841,8 +3841,8 @@ HADAMARD_AC_SSE2
 ;=============================================================================

 ;-----------------------------------------------------------------------------
-; void pixel_ssim_4x4x2_core( const uint8_t *pix1, int stride1,
-;                             const uint8_t *pix2, int stride2, int sums[2][4] )
+; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
+;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
 ;-----------------------------------------------------------------------------
 %macro SSIM_ITER 1
 %if HIGH_BIT_DEPTH
@@ -4006,8 +4006,10 @@ SSIM
 ;=============================================================================

 %macro ADS_START 0
-%if WIN64
+%if UNIX64
     movsxd  r5, r5d
+%else
+    mov     r5d, r5m
 %endif
     mov     r0d, r5d
     lea     r6, [r4+r5+15]
@@ -4030,7 +4032,7 @@ SSIM
 ;                  uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
 INIT_MMX mmx2
-cglobal pixel_ads4, 6,7
+cglobal pixel_ads4, 5,7
     movq    mm6, [r0]
     movq    mm4, [r0+8]
     pshufw  mm7, mm6, 0
@@ -4061,7 +4063,7 @@ cglobal pixel_ads4, 6,7
     movd    [r6], mm1
     ADS_END 1

-cglobal pixel_ads2, 6,7
+cglobal pixel_ads2, 5,7
     movq    mm6, [r0]
     pshufw  mm5, r6m, 0
     pshufw  mm7, mm6, 0
@@ -4082,7 +4084,7 @@ cglobal pixel_ads2, 6,7
     movd    [r6], mm4
     ADS_END 1

-cglobal pixel_ads1, 6,7
+cglobal pixel_ads1, 5,7
     pshufw  mm7, [r0], 0
     pshufw  mm6, r6m, 0
     ADS_START
@@ -4104,7 +4106,7 @@ cglobal pixel_ads1, 6,7
     ADS_END 2

 %macro ADS_XMM 0
-cglobal pixel_ads4, 6,7,12
+cglobal pixel_ads4, 5,7,12
     movdqa  xmm4, [r0]
     pshuflw xmm7, xmm4, 0
     pshuflw xmm6, xmm4, q2222
@@ -4168,7 +4170,7 @@ cglobal pixel_ads4, 6,7,12
 %endif ; ARCH
     ADS_END 2

-cglobal pixel_ads2, 6,7,8
+cglobal pixel_ads2, 5,7,8
     movq    xmm6, [r0]
     movd    xmm5, r6m
     pshuflw xmm7, xmm6, 0
@@ -4193,7 +4195,7 @@ cglobal pixel_ads2, 6,7,8
     movq    [r6], xmm1
     ADS_END 2

-cglobal pixel_ads1, 6,7,8
+cglobal pixel_ads1, 5,7,8
     movd    xmm7, [r0]
     movd    xmm6, r6m
     pshuflw xmm7, xmm7, 0
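ADS_START above now encodes the two safe ways of consuming a 32-bit int in 64-bit code: the UNIX64 path sign-extends the register explicitly (movsxd r5, r5d), and the other path reloads the argument's low 32 bits from its stack slot (mov r5d, r5m), which is safe because any write to a 32-bit register clears the upper half on x86_64. The same semantics expressed in C (a sketch, not patch code):

    #include <stdint.h>

    /* Explicit widening: compiles to a movsxd of the low dword. */
    int64_t widen_signed( int x )
    {
        return (int64_t)x;
    }

    /* A 32-bit register write zero-extends into the full register, which is
     * why reloading only the low half of the stack slot is also correct. */
    uint64_t widen_unsigned( uint32_t x )
    {
        return x;
    }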
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 11823f08..eeea9c70 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -39,11 +39,11 @@ ret x264_pixel_##name##_4x4_##suffix args;\

 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( pixel *, int, pixel *, int ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )

 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, int, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )

 DECL_X1( sad, mmx2 )
 DECL_X1( sad, sse2 )
@@ -84,16 +84,16 @@ DECL_X4( sad, cache64_mmx2 );
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );

-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
-DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
+DECL_PIXELS( uint64_t, var, mmx2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, sse2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, xop,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, mmx2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse2,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, sse4,  ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, avx,   ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, hadamard_ac, xop,   ( pixel *pix, intptr_t i_stride ))

 void x264_intra_satd_x3_4x4_mmx2   ( pixel *, pixel *, int * );
@@ -130,35 +130,35 @@ int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
 int x264_intra_sad_x9_8x8_sse4  ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
 int x264_intra_sad_x9_8x8_avx   ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );

-void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, int stride1,
-                                    pixel *pixuv2, int stride2, int width,
+void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
-void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, int stride1,
-                                      const uint8_t *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, int stride1,
-                                      const pixel *pix2, int stride2, int sums[2][4] );
-void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, int stride1,
-                                      const pixel *pix2, int stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
+                                      const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
+                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
+void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
+                                      const pixel *pix2, intptr_t stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
-int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
-int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
-int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
-int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
-int x264_pixel_vsad_ssse3( pixel *src, int stride, int height );
-int x264_pixel_vsad_xop( pixel *src, int stride, int height );
+float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
+int x264_pixel_var2_8x8_mmx2  ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_sse2  ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_xop   ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
+int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_xop  ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
+int x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );

 #define DECL_ADS( size, suffix ) \
     int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 744b836d..883f0018 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -759,6 +759,7 @@ OPTIMIZE_CHROMA_2x2_DC
 %macro DENOISE_DCT 0
 cglobal denoise_dct, 4,4,8
     pxor      m6, m6
+    movsxdifnidn r3, r3d
 .loop:
     mova      m2, [r0+r3*4-2*mmsize]
     mova      m3, [r0+r3*4-1*mmsize]
@@ -804,6 +805,7 @@ DENOISE_DCT
 %macro DENOISE_DCT 0
 cglobal denoise_dct, 4,4,7
     pxor      m6, m6
+    movsxdifnidn r3, r3d
 .loop:
     mova      m2, [r0+r3*2-2*mmsize]
     mova      m3, [r0+r3*2-1*mmsize]
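denoise_dct receives its coefficient count as an int in r3 and immediately uses it in address arithmetic ([r0+r3*4]), so the added movsxdifnidn sign-extends the low dword before any 64-bit indexing. The failure mode it guards against, modeled in C with hypothetical helpers (not patch code):

    #include <stdint.h>

    /* raw_reg stands in for the full 64-bit argument register; its high
     * half is unspecified when the caller only passed a 32-bit int. */
    uint32_t load_buggy( uint32_t *buf, uint64_t raw_reg )
    {
        return buf[raw_reg];                    /* junk high bits shift the load */
    }

    uint32_t load_fixed( uint32_t *buf, uint64_t raw_reg )
    {
        return buf[(int64_t)(int32_t)raw_reg];  /* what movsxd establishes */
    }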
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 58f4273e..50ad2d72 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -80,7 +80,7 @@ cextern sw_64
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD 2
 cglobal pixel_sad_%1x%2_mmx2, 4,4
@@ -116,7 +116,7 @@ SAD  4,  4

 %macro SAD_W16 0
 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x16( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x16, 4,4,8
     movu    m0, [r2]
@@ -183,7 +183,7 @@ cglobal pixel_sad_16x16, 4,4,8
     SAD_END_SSE2

 ;-----------------------------------------------------------------------------
-; int pixel_sad_16x8( uint8_t *, int, uint8_t *, int )
+; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 cglobal pixel_sad_16x8, 4,4
     movu    m0, [r2]
@@ -257,7 +257,7 @@ cglobal pixel_sad_8x16_sse2, 4,4
     RET

 ;-----------------------------------------------------------------------------
-; void pixel_vsad( pixel *src, int stride );
+; void pixel_vsad( pixel *src, intptr_t stride );
 ;-----------------------------------------------------------------------------

 %if ARCH_X86_64 == 0
@@ -867,14 +867,10 @@ INTRA_SAD16

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, int i_stride, int scores[3] )
+;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
-%if WIN64
-    %assign i %1+1
-    movsxd r %+ i, r %+ i %+ d
-%endif
     SAD_X%1_2x%2P 1
 %rep %3/2-1
     SAD_X%1_2x%2P 0
@@ -1190,14 +1186,10 @@ SAD_X 4,  4,  4

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_x3_16x16( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
-;                          uint8_t *pix2, int i_stride, int scores[3] )
+;                          uint8_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X_SSE2 3
 cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
-%if WIN64
-    %assign i %1+1
-    movsxd r %+ i, r %+ i %+ d
-%endif
     SAD_X%1_2x%2P_SSE2 1
 %rep %3/2-1
     SAD_X%1_2x%2P_SSE2 0
@@ -1485,9 +1477,6 @@ cglobal pixel_sad_x4_%1x%2_cache%3_%6
 %if ARCH_X86_64
     PROLOGUE 6,9
     mov  r8, r6mp
-%if WIN64
-    movsxd r5, r5d
-%endif
     push r4
     push r3
     push r2
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 39f72598..273d0a01 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -87,7 +87,7 @@ cextern pw_8
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD_MMX 3
 cglobal pixel_sad_%1x%2, 4,4
@@ -152,7 +152,7 @@ SAD_MMX  4,  4, 2
 %endmacro

 ;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, int, uint16_t *, int )
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
 ;-----------------------------------------------------------------------------
 %macro SAD_XMM 2
 cglobal pixel_sad_%1x%2, 4,4,8
@@ -402,15 +402,12 @@ PIXEL_VSAD

 ;-----------------------------------------------------------------------------
 ; void pixel_sad_xK_MxN( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
-;                        uint16_t *pix2, int i_stride, int scores[3] )
+;                        uint16_t *pix2, intptr_t i_stride, int scores[3] )
 ;-----------------------------------------------------------------------------
 %macro SAD_X 3
 cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
     %assign regnum %1+1
     %xdefine STRIDE r %+ regnum
-%if WIN64
-    movsxd STRIDE, STRIDE %+ d
-%endif
     mov     r6, %3/2-1
     SAD_X%1_ONE_START
     SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index f9be2cf0..4b2229ec 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -52,11 +52,6 @@
     %define mangle(x) x
 %endif

-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
 ; so use a different read-only section.
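The FIXME removed above described the root cause: both 64-bit calling conventions define only the low 32 bits of a register that holds an int argument, and while GCC happens to zero the rest, Clang is free not to. The distinction in C terms (an illustrative sketch, not patch code):

    #include <stdint.h>

    uint8_t read_row( uint8_t *pix, int stride, int y )
    {
        /* Correct 64-bit code widens explicitly; this indexing compiles to a
         * movsxd of stride before the multiply. The broken asm instead used
         * the full register (e.g. an imul on the 64-bit register), picking up
         * whatever the caller happened to leave in the high dword. */
        return pix[(intptr_t)stride * y];
    }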
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 51691c93..e4778771 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1987,7 +1987,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
     ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
     pixel *src0, *src1;
-    int stride0 = 16, stride1 = 16;
+    intptr_t stride0 = 16, stride1 = 16;
     int i_ref, i_mvc;
     ALIGNED_4( int16_t mvc[9][2] );
     int try_skip = a->b_try_skip;
@@ -2304,7 +2304,7 @@ static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t
         int y8 = i>>1;
         int i_part_cost;
         int i_part_cost_bi;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_8x8;
@@ -2393,7 +2393,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         int y8 = i>>1;
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];

         for( int l = 0; l < 2; l++ )
@@ -2464,7 +2464,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i
     {
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {16,16};
+        intptr_t stride[2] = {16,16};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_16x8;
@@ -2558,7 +2558,7 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i
     {
         int i_part_cost;
         int i_part_cost_bi = 0;
-        int stride[2] = {8,8};
+        intptr_t stride[2] = {8,8};
         pixel *src[2];
         x264_me_t m;
         m.i_pixel = PIXEL_8x16;
diff --git a/encoder/me.c b/encoder/me.c
index ccc7ad40..7b11e01d 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -70,7 +70,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite

 #define COST_MV_HPEL( mx, my ) \
 { \
-    int stride2 = 16; \
+    intptr_t stride2 = 16; \
     pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -775,7 +775,7 @@ void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh

 #define COST_MV_SAD( mx, my ) \
 { \
-    int stride = 16; \
+    intptr_t stride = 16; \
     pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -785,7 +785,7 @@ void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh
 #define COST_MV_SATD( mx, my, dir ) \
 if( b_refine_qpel || (dir^1) != odir ) \
 { \
-    int stride = 16; \
+    intptr_t stride = 16; \
     pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
     int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
@@ -854,7 +854,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
         {
             int omx = bmx, omy = bmy;
             int costs[4];
-            int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
+            intptr_t stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
             pixel *src0, *src1, *src2, *src3;
             src0 = h->mc.get_ref( pix,    &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
             src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
@@ -988,7 +988,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
     int ref1 = h->mb.cache.ref[1][s8];
     const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
     const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
-    int stride[3][2][9];
+    intptr_t stride[3][2][9];
     int bm0x = m0->mv[0];
     int bm0y = m0->mv[1];
     int bm1x = m1->mv[0];
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index c693b3f4..65ea761c 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -505,7 +505,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
         } \
         else \
         { \
-            int stride1 = 16, stride2 = 16; \
+            intptr_t stride1 = 16, stride2 = 16; \
             pixel *src1, *src2; \
             src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
                                   (mv0)[0], (mv0)[1], 8, 8, w ); \
diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm
index a0b85fac..47a4f65e 100644
--- a/tools/checkasm-a.asm
+++ b/tools/checkasm-a.asm
@@ -4,6 +4,7 @@
 ;* Copyright (C) 2008-2012 x264 project
 ;*
 ;* Authors: Loren Merritt
+;*          Henrik Gramner
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -29,7 +30,7 @@ SECTION_RODATA

 error_message: db "failed to preserve register", 0

-%if WIN64
+%if ARCH_X86_64
 ; just random numbers to reduce the chance of incidental match
 ALIGN 16
 x6:  ddq 0x79445c159ce790641a1b2550a612b48c
@@ -60,64 +61,107 @@ cextern_naked puts

 ; (max_args % 4) must equal 3 for stack alignment
 %define max_args 15

+%if ARCH_X86_64
+
+;-----------------------------------------------------------------------------
+; void x264_checkasm_stack_clobber( uint64_t clobber, ... )
+;-----------------------------------------------------------------------------
+cglobal checkasm_stack_clobber, 1,2
+    ; Clobber the stack with junk below the stack pointer
+    %define size (max_args+6)*8
+    SUB  rsp, size
+    mov   r1, size-8
+.loop:
+    mov [rsp+r1], r0
+    sub   r1, 8
+    jge .loop
+    ADD  rsp, size
+    RET
+
 %if WIN64
+    %assign free_regs 7
+%else
+    %assign free_regs 9
+%endif
+
 ;-----------------------------------------------------------------------------
 ; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checkasm_call, 4,15,16
-    SUB  rsp, max_args*8
+cglobal checkasm_call, 2,15,16
+    SUB  rsp, max_args*8+16
     mov  r6, r0
-    mov  [rsp+stack_offset+16], r1
-    mov  r0, r2
-    mov  r1, r3
-    mov  r2d, r4m ; FIXME truncates pointer
-    mov  r3d, r5m ; FIXME truncates pointer
-%assign i 4
-%rep max_args-4
-    mov  r4, [rsp+stack_offset+8+(i+2)*8]
-    mov  [rsp+i*8], r4
-    %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
-    mova m %+ i, [x %+ i]
-    %assign i i+1
-%endrep
-%assign i 7
-%rep 15-7
+    mov  [rsp+max_args*8], r1
+
+    ; All arguments have been pushed on the stack instead of registers in order to
+    ; test for incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
+    mov  r0, r6mp
+    mov  r1, r7mp
+    mov  r2, r8mp
+    mov  r3, r9mp
+%if UNIX64
+    mov  r4, r10mp
+    mov  r5, r11mp
+    %assign i 6
+    %rep max_args-6
+        mov  r9, [rsp+stack_offset+(i+1)*8]
+        mov  [rsp+(i-6)*8], r9
+        %assign i i+1
+    %endrep
+%else
+    %assign i 4
+    %rep max_args-4
+        mov  r9, [rsp+stack_offset+(i+7)*8]
+        mov  [rsp+i*8], r9
+        %assign i i+1
+    %endrep
+%endif
+
+%if WIN64
+    %assign i 6
+    %rep 16-6
+        mova m %+ i, [x %+ i]
+        %assign i i+1
+    %endrep
+%endif
+
+%assign i 14
+%rep 15-free_regs
     mov  r %+ i, [n %+ i]
-    %assign i i+1
+    %assign i i-1
 %endrep
 call r6
-%assign i 7
-%rep 15-7
+%assign i 14
+%rep 15-free_regs
     xor  r %+ i, [n %+ i]
-    or   r7, r %+ i
-    %assign i i+1
-%endrep
-%assign i 6
-%rep 16-6
-    pxor m %+ i, [x %+ i]
-    por  m6, m %+ i
-    %assign i i+1
+    or   r14, r %+ i
+    %assign i i-1
 %endrep
+
+%if WIN64
+    %assign i 6
+    %rep 16-6
+        pxor m %+ i, [x %+ i]
+        por  m6, m %+ i
+        %assign i i+1
+    %endrep
 packsswb m6, m6
 movq r5, m6
-    or   r7, r5
+    or   r14, r5
+%endif
+
     jz .ok
-    mov  r4, rax
+    mov  r9, rax
     lea  r0, [error_message]
     call puts
-    mov  r1, [rsp+stack_offset+16]
+    mov  r1, [rsp+max_args*8]
     mov  dword [r1], 0
-    mov  rax, r4
+    mov  rax, r9
 .ok:
-    ADD  rsp, max_args*8
+    ADD  rsp, max_args*8+16
     RET

-%elif ARCH_X86_64 == 0
+%else

 ; just random numbers to reduce the chance of incidental match
 %define n3 dword 0x6549315c
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 144d28fa..b9b6b8ae 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -196,17 +196,33 @@ static void print_bench(void)

 #if ARCH_X86 || ARCH_X86_64
 int x264_stack_pagealign( int (*func)(), int align );
+
+/* detect when callee-saved regs aren't saved.
+ * needs an explicit asm check because it only sometimes crashes in normal use. */
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
 #else
 #define x264_stack_pagealign( func, align ) func()
 #endif

 #define call_c1(func,...) func(__VA_ARGS__)

-#if ARCH_X86 || defined(_WIN64)
-/* detect when callee-saved regs aren't saved.
- * needs an explicit asm check because it only sometimes crashes in normal use. */
-intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
-#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__)
+#if ARCH_X86_64
+/* Evil hack: detect incorrect assumptions that 32-bit ints are zero-extended to 64-bit.
+ * This is done by clobbering the stack with junk around the stack pointer and calling the
+ * assembly function through x264_checkasm_call with added dummy arguments which forces all
+ * real arguments to be passed on the stack and not in registers. For 32-bit arguments the
+ * upper half of the 64-bit register location on the stack will now contain junk. Note that
+ * this is dependent on compiler behaviour and that interrupts etc. at the wrong time may
+ * overwrite the junk written to the stack, so there's no guarantee that it will always
+ * detect all functions that assume zero-extension.
+ */
+void x264_checkasm_stack_clobber( uint64_t clobber, ... );
+#define call_a1(func,...) ({ \
+    uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
+    x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
+    x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
+#elif ARCH_X86
+#define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ )
 #else
 #define call_a1 call_c1
 #endif
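The junk constant in call_a1 replicates one random 16-bit pattern into all four words of a 64-bit value, which keeps accidental matches with legitimate data unlikely while staying cheap to generate on every call. A standalone model of the construction (hypothetical name, not test code from this patch):

    #include <stdint.h>
    #include <stdlib.h>

    /* Same construction as call_a1 above. */
    static uint64_t make_stack_junk( void )
    {
        return (rand() & 0xffff) * 0x0001000100010001ULL;
    }

Any function under test that reads a full 64-bit stack slot where the wrapper only stored a 32-bit int will compute with those junk words and fail the comparison against the C reference.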
@@ -291,8 +307,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int j = 0; j < 64; j++ ) \
         { \
-            res_c   = call_c( pixel_c.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
-            res_asm = call_a( pixel_asm.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
+            res_c   = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+            res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
             if( res_c != res_asm ) \
             { \
                 ok = 0; \
@@ -332,16 +348,16 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 64; j++ ) \
         { \
             pixel *pix2 = pbuf2+j; \
-            res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
+            res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2,   64 ); \
             res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
             res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
             if( N == 4 ) \
             { \
                 res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
-                call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+                call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
             } \
             else \
-                call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+                call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
             if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
             { \
                 ok = 0; \
@@ -350,9 +366,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
                          res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
             } \
             if( N == 4 ) \
-                call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+                call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, (intptr_t)64, res_asm ); \
             else \
-                call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+                call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
         } \
     } \
 } \
@@ -367,8 +383,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
         /* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
-        call_c1( pixel_c.var[i], pbuf1, 16 ); \
-        call_a1( pixel_asm.var[i], pbuf1, 16 ); \
+        call_c1( pixel_c.var[i], pbuf1, 16 ); \
+        call_a1( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
         uint64_t res_c   = pixel_c.var[i]( pbuf1, 16 ); \
         uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
         if( res_c != res_asm ) \
@@ -376,8 +392,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             ok = 0; \
             fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
         } \
-        call_c2( pixel_c.var[i], pbuf1, 16 ); \
-        call_a2( pixel_asm.var[i], pbuf1, 16 ); \
+        call_c2( pixel_c.var[i], pbuf1, (intptr_t)16 ); \
+        call_a2( pixel_asm.var[i], pbuf1, (intptr_t)16 ); \
     }

     ok = 1; used_asm = 0;
@@ -392,8 +408,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             int res_c, res_asm, ssd_c, ssd_asm; \
             set_func_name( "%s_%s", "var2", pixel_names[i] ); \
             used_asm = 1; \
-            res_c   = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
-            res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
+            res_c   = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
+            res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
             if( res_c != res_asm || ssd_c != ssd_asm ) \
             { \
                 ok = 0; \
@@ -415,8 +431,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 32; j++ )
         {
             pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
-            call_c1( pixel_c.hadamard_ac[i], pbuf1, 16 );
-            call_a1( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+            call_c1( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+            call_a1( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
             uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
             uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
             if( rc != ra )
@@ -426,8 +442,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
                 break;
             }
         }
-        call_c2( pixel_c.hadamard_ac[i], pbuf1, 16 );
-        call_a2( pixel_asm.hadamard_ac[i], pbuf1, 16 );
+        call_c2( pixel_c.hadamard_ac[i], pbuf1, (intptr_t)16 );
+        call_a2( pixel_asm.hadamard_ac[i], pbuf1, (intptr_t)16 );
     }
     report( "pixel hadamard_ac :" );

@@ -446,8 +462,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
         for( int j = 0; j < 2 && ok; j++ )
         {
             pixel *p = j ? pbuf4 : pbuf1;
-            res_c   = call_c( pixel_c.vsad, p, 16, h );
-            res_asm = call_a( pixel_asm.vsad, p, 16, h );
+            res_c   = call_c( pixel_c.vsad, p, (intptr_t)16, h );
+            res_asm = call_a( pixel_asm.vsad, p, (intptr_t)16, h );
             if( res_c != res_asm )
             {
                 ok = 0;
@@ -627,8 +643,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             fprintf( stderr, "ssd_nv12: %"PRIu64",%"PRIu64" != %"PRIu64",%"PRIu64"\n",
                      res_u_c, res_v_c, res_u_a, res_v_a );
         }
-        call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_c, &res_v_c );
-        call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8, &res_u_a, &res_v_a );
+        call_c( pixel_c.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_c, &res_v_c );
+        call_a( pixel_asm.ssd_nv12_core, pbuf1, (intptr_t)368, pbuf2, (intptr_t)368, 360, 8, &res_u_a, &res_v_a );
     }
     report( "ssd_nv12 :" );

@@ -648,8 +664,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
             fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
         }
         set_func_name( "ssim_core" );
-        call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
-        call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
+        call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
+        call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, (intptr_t)32, pbuf2+2, (intptr_t)32, sums );
         set_func_name( "ssim_end" );
         call_c2( pixel_c.ssim_end4, sums, sums, 4 );
         call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
@@ -1054,8 +1070,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
-        call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
+        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+        call_a( mc_a.mc_luma, dst2, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
         if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
         { \
             fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
@@ -1065,15 +1081,15 @@ static int check_mc( int cpu_ref, int cpu_new )
     if( mc_a.get_ref != mc_ref.get_ref ) \
     { \
         pixel *ref = dst2; \
-        int ref_stride = 32; \
+        intptr_t ref_stride = 32; \
         int w_checked = ( ( sizeof(pixel) == 2 && (w == 12 || w == 20)) ? w-2 : w ); \
         const x264_weight_t *weight = x264_weight_none; \
         set_func_name( "get_ref_%dx%d", w_checked, h ); \
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
-        ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
+        call_c( mc_c.mc_luma, dst1, (intptr_t)32, src2, (intptr_t)64, dx, dy, w, h, weight ); \
+        ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, (intptr_t)64, dx, dy, w, h, weight ); \
         for( int i = 0; i < h; i++ ) \
             if( memcmp( dst1+i*32, ref+i*ref_stride, w_checked * sizeof(pixel) ) ) \
             { \
@@ -1090,14 +1106,14 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 1024; i++ ) \
             pbuf3[i] = pbuf4[i] = 0xCD; \
-        call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \
-        call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \
+        call_c( mc_c.mc_chroma, dst1, dst1+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
+        call_a( mc_a.mc_chroma, dst2, dst2+8, (intptr_t)16, src, (intptr_t)64, dx, dy, w, h ); \
         /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
         for( int j = 0; j < h; j++ ) \
             for( int i = w; i < 8; i++ ) \
             { \
                 dst2[i+j*16+8] = dst1[i+j*16+8]; \
-                dst2[i+j*16] = dst1[i+j*16]; \
+                dst2[i+j*16  ] = dst1[i+j*16  ]; \
             } \
         if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
         { \
@@ -1149,15 +1165,15 @@ static int check_mc( int cpu_ref, int cpu_new )
             { \
                 set_func_name( "%s_%s", #name, pixel_names[i] ); \
                 used_asm = 1; \
-                call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-                call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+                call_c1( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+                call_a1( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
                 if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
                 { \
                     ok = 0; \
                     fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
                 } \
-                call_c2( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-                call_a2( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+                call_c2( mc_c.name[i], pbuf3, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
+                call_a2( mc_a.name[i], pbuf4, (intptr_t)16, pbuf2+1, (intptr_t)16, pbuf1+18, (intptr_t)16, weight ); \
             } \
         } \
     }
@@ -1185,9 +1201,9 @@ static int check_mc( int cpu_ref, int cpu_new )
             { \
                 set_func_name( "%s_w%d", #name, j ); \
                 used_asm = 1; \
-                call_c1( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_c1( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
                 mc_a.weight_cache(&ha, &weight); \
-                call_a1( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_a1( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
                 for( int k = 0; k < 16; k++ ) \
                     if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
                     { \
                         ok = 0; \
                         fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
                         break; \
                     } \
-                call_c2( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
-                call_a2( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
+                call_c2( mc_c.weight[i], buffC, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
+                call_a2( weight.weightfn[i], buffA, (intptr_t)32, pbuf2+align_off, (intptr_t)32, &weight, 16 ); \
             } \
         }
@@ -1248,8 +1264,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         used_asm = 1;
         memset( pbuf3, 0, 64*height );
         memset( pbuf4, 0, 64*height );
-        call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
-        call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
+        call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
+        call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
         if( memcmp( pbuf3, pbuf4, 64*height ) )
         {
             ok = 0;
@@ -1261,8 +1277,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
        set_func_name( "load_deinterleave_chroma_fenc" );
         used_asm = 1;
-        call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
-        call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
+        call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, (intptr_t)64, height );
+        call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, (intptr_t)64, height );
         if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
         {
             ok = 0;
@@ -1274,8 +1290,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "load_deinterleave_chroma_fdec" );
         used_asm = 1;
-        call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
-        call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
+        call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, (intptr_t)64, height );
+        call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, (intptr_t)64, height );
         if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
         {
             ok = 0;
@@ -1298,8 +1314,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = plane_specs[i].w;
         int h = plane_specs[i].h;
-        int src_stride = plane_specs[i].src_stride;
-        int dst_stride = (w + 127) & ~63;
+        intptr_t src_stride = plane_specs[i].src_stride;
+        intptr_t dst_stride = (w + 127) & ~63;
         assert( dst_stride * h <= 0x1000 );
         pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
         memset( pbuf3, 0, 0x1000*sizeof(pixel) );
@@ -1310,7 +1326,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1324,8 +1340,8 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = (plane_specs[i].w + 1) >> 1;
         int h = plane_specs[i].h;
-        int src_stride = (plane_specs[i].src_stride + 1) >> 1;
-        int dst_stride = (2*w + 127) & ~63;
+        intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
+        intptr_t dst_stride = (2*w + 127) & ~63;
         assert( dst_stride * h <= 0x1000 );
         pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
         memset( pbuf3, 0, 0x1000*sizeof(pixel) );
@@ -1336,7 +1352,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1350,9 +1366,9 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         int w = (plane_specs[i].w + 1) >> 1;
         int h = plane_specs[i].h;
-        int dst_stride = w;
-        int src_stride = (2*w + 127) & ~63;
-        int offv = (dst_stride*h + 31) & ~15;
+        intptr_t dst_stride = w;
+        intptr_t src_stride = (2*w + 127) & ~63;
+        intptr_t offv = (dst_stride*h + 31) & ~15;
         memset( pbuf3, 0, 0x1000 );
         memset( pbuf4, 0, 0x1000 );
         call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
@@ -1362,7 +1378,7 @@ static int check_mc( int cpu_ref, int cpu_new )
                 memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
             {
                 ok = 0;
-                fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+                fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
                 break;
             }
     }
@@ -1379,8 +1395,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         ok = 1; used_asm = 1;
         memset( pbuf3, 0, 4096 * sizeof(pixel) );
         memset( pbuf4, 0, 4096 * sizeof(pixel) );
-        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
-        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
+        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, (intptr_t)64, 48, 10, tmp );
+        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, (intptr_t)64, 48, 10, tmp );
         for( int i = 0; i < 3; i++ )
             for( int j = 0; j < 10; j++ )
                 //FIXME ideally the first pixels would match too, but they aren't actually used
@@ -1407,9 +1423,9 @@ static int check_mc( int cpu_ref, int cpu_new )
         ok = 1; used_asm = 1;
         for( int w = 40; w <= 48; w += 8 )
         {
-            int stride = (w+8)&~15;
-            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
-            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+            intptr_t stride = (w+8)&~15;
+            call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], (intptr_t)w*2, stride, w, 16 );
+            call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], (intptr_t)w*2, stride, w, 16 );
             for( int i = 0; i < 16; i++ )
             {
                 for( int j = 0; j < 4; j++ )
@@ -1433,7 +1449,7 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define INTEGRAL_INIT( name, size, ... )\
     if( mc_a.name != mc_ref.name )\
     {\
-        int stride = 80;\
+        intptr_t stride = 80;\
         set_func_name( #name );\
         used_asm = 1;\
         memcpy( buf3, buf1, size*2*stride );\
@@ -1496,7 +1512,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "memcpy_aligned" );
         ok = 1; used_asm = 1;
-        for( int size = 16; size < 256; size += 16 )
+        for( size_t size = 16; size < 256; size += 16 )
         {
             memset( buf4, 0xAA, size + 1 );
             call_c( mc_c.memcpy_aligned, buf3, buf1, size );
@@ -1504,7 +1520,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
             {
                 ok = 0;
-                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
                 break;
             }
         }
@@ -1515,7 +1531,7 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "memzero_aligned" );
         ok = 1; used_asm = 1;
-        for( int size = 128; size < 1024; size += 128 )
+        for( size_t size = 128; size < 1024; size += 128 )
         {
             memset( buf4, 0xAA, size + 1 );
             call_c( mc_c.memzero_aligned, buf3, size );
@@ -1523,7 +1539,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
             {
                 ok = 0;
-                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
                 break;
             }
         }
@@ -1561,7 +1577,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
 #define TEST_DEBLOCK( name, align, ... ) \
     for( int i = 0; i < 36; i++ ) \
     { \
-        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
+        intptr_t off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
            /* two distributions of random to excersize different failure modes */ \
            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
@@ -1570,16 +1586,16 @@ static int check_deblock( int cpu_ref, int cpu_new )
        { \
            set_func_name( #name ); \
            used_asm = 1; \
-            call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_c1( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_a1( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
                 break; \
             } \
-            call_c2( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            call_a2( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_c2( db_c.name, pbuf3+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+            call_a2( db_a.name, pbuf4+off, (intptr_t)32, alphas[i], betas[i], ##__VA_ARGS__ ); \
        } \
     }
@@ -1935,11 +1951,11 @@ static int check_quant( int cpu_ref, int cpu_new )
             memcpy( dct1, buf1, size*sizeof(dctcoef) );
             memcpy( dct2, buf1, size*sizeof(dctcoef) );
             memcpy( buf3+256, buf3, 256 );
-            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
             call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
             if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                 ok = 0;
-            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (udctcoef*)buf2, size );
+            call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3,       (udctcoef*)buf2, size );
             call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (udctcoef*)buf2, size );
         }
     }