From b7d27eaab35a6fdffc66ffff51bd287b0f67bb3e Mon Sep 17 00:00:00 2001
From: Fiona Glaser
Date: Wed, 17 Sep 2008 21:25:05 -0700
Subject: [PATCH] Rewrite avg/avg_weight to take two source pointers

This allows the use of get_ref instead of mc_luma almost everywhere for
bipred.
---
 common/macroblock.c | 64 ++++++++++++++++++++-------------
 common/mc.c         | 25 +++++++------
 common/mc.h         |  4 +--
 common/x86/mc-a.asm | 81 +++++++++++++++++++++--------------------
 common/x86/mc-c.c   | 47 ++++++++++++------------
 encoder/analyse.c   | 88 +++++++++++++++++++--------------------------
 encoder/me.c        | 16 +++++----
 encoder/slicetype.c | 14 ++++----
 tools/checkasm.c    | 14 ++++----
 9 files changed, 182 insertions(+), 171 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 883b515e..39f50aa8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -554,47 +554,63 @@ static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int hei
 static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
 {
     const int i8 = x264_scan8[0]+x+8*y;
-
+    const int i_ref0 = h->mb.cache.ref[0][i8];
     const int i_ref1 = h->mb.cache.ref[1][i8];
+    const int mvx0   = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
     const int mvx1   = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
+    int       mvy0   = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
     int       mvy1   = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
-    DECLARE_ALIGNED_16( uint8_t tmp[16*16] );
-    int i_mode = x264_size2pixel[height][width];
-
-    x264_mb_mc_0xywh( h, x, y, width, height );
-
-    h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
-                   mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
-
+    int       i_mode = x264_size2pixel[height][width];
+    int       i_stride0 = 16, i_stride1 = 16;
+    DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
+    DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+    uint8_t *src0, *src1;
+
+    src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
+                          mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
+    src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
+                          mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
+
+    if( h->mb.b_interlaced & i_ref0 )
+        mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
     if( h->mb.b_interlaced & i_ref1 )
         mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
 
+    h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                     &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+                     mvx0, mvy0, 2*width, 2*height );
+
+    h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                     &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+                     mvx0, mvy0, 2*width, 2*height );
+
     if( h->param.analyse.b_weighted_bipred )
     {
-        const int i_ref0 = h->mb.cache.ref[0][i8];
         const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
-        h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+        h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+                                  src0, i_stride0, src1, i_stride1, weight );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                    &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
+        h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                                    &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
     }
     else
     {
-        h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
+        h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
+                           src0, i_stride0, src1, i_stride1 );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
-
-        h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                             &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
+        h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
                          mvx1, mvy1, 2*width, 2*height );
-        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
+        h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+                             &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
     }
 }
diff --git a/common/mc.c b/common/mc.c
index 2be45cc7..9fc7a343 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -49,25 +49,27 @@ static inline void pixel_avg( uint8_t *dst, int i_dst_stride,
     }
 }
 
-static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
+static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
 {
     int x, y;
     for( y = 0; y < height; y++ )
     {
         for( x = 0; x < width; x++ )
         {
-            dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
+            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
         }
+        src1 += i_src1;
+        src2 += i_src2;
         dst += i_dst;
-        src += i_src;
     }
 }
 
 #define PIXEL_AVG_C( name, width, height ) \
 static void name( uint8_t *pix1, int i_stride_pix1, \
-                  uint8_t *pix2, int i_stride_pix2 ) \
+                  uint8_t *pix2, int i_stride_pix2, \
+                  uint8_t *pix3, int i_stride_pix3 ) \
 { \
-    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
+    pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
 }
 PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
 PIXEL_AVG_C( pixel_avg_16x8,  16, 8 )
@@ -83,11 +85,13 @@ PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
 
 /* Implicit weighted bipred only:
  * assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-#define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
-static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
+#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
+static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
+{
     int y;
     const int i_weight2 = 64 - i_weight1;
-    for(y=0; y<height; y++, dst += i_dst, src += i_src){
[...]

diff --git a/common/mc.h b/common/mc.h
[...]

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
[...]

diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
[...]
         pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
         pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
         pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
-        if( !(cpu&X264_CPU_STACK_MOD4) )
-        {
-            pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
-            pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
-            pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
-            pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
-            pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
-        }
+        pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
+        pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
+        pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
+        pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
+        pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
         pf->hpel_filter = x264_hpel_filter_sse2;
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
         pf->mc_chroma = x264_mc_chroma_sse2;
diff --git a/encoder/analyse.c b/encoder/analyse.c
index ecbaf4ec..7b941c55 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -1474,21 +1474,21 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
     }
 }
 
-#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
-    { \
-        if( h->param.analyse.b_weighted_bipred ) \
-            h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
-                    h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
-        else \
-            h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
-    }
+#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
+{ \
+    if( h->param.analyse.b_weighted_bipred ) \
+        h->mc.avg_weight[size]( pix, stride, src1, stride1, src2, stride2, \
+                h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
+    else \
+        h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2 ); \
+}
 
 static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
 {
+    DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
     DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
-    DECLARE_ALIGNED_16( uint8_t pix2[16*16] );
-    uint8_t *src2;
-    int stride2 = 16;
+    uint8_t *src0, *src1;
+    int stride0 = 16, stride1 = 16;
     int weight;
 
     x264_me_t m;
@@ -1560,40 +1560,19 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
     /* get cost of BI mode */
     weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
 
-    if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
-    {
-        /* l0 reference is halfpel, so get_ref on it will make it faster */
-        src2 =
-        h->mc.get_ref( pix2, &stride2,
-                       h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                       a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
-                       16, 16 );
-        h->mc.mc_luma( pix1, 16,
-                       h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                       a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
-                       16, 16 );
-        weight = 64 - weight;
-    }
-    else
-    {
-        /* if l0 was qpel, we'll use get_ref on l1 instead */
-        h->mc.mc_luma( pix1, 16,
-                       h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
-                       a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
-                       16, 16 );
-        src2 =
-        h->mc.get_ref( pix2, &stride2,
-                       h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
-                       a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
-                       16, 16 );
-    }
+    src0 = h->mc.get_ref( pix0, &stride0,
+                          h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
+                          a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
+    src1 = h->mc.get_ref( pix1, &stride1,
+                          h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
+                          a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
 
     if( h->param.analyse.b_weighted_bipred )
-        h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
+        h->mc.avg_weight[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, weight );
     else
-        h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
+        h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
 
-    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
+    a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
                      + REF_COST( 0, a->l0.i_ref )
                      + REF_COST( 1, a->l1.i_ref )
                      + a->l0.me16x16.cost_mv
@@ -1709,6 +1688,8 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
         const int y8 = i/2;
         int i_part_cost;
         int i_part_cost_bi = 0;
+        int stride[2] = {8,8};
+        uint8_t *src[2];
 
         for( l = 0; l < 2; l++ )
         {
@@ -1727,13 +1708,12 @@
             x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
 
             /* BI mode */
-            h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
-                           m->mv[0], m->mv[1], 8, 8 );
+            src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                    m->mv[0], m->mv[1], 8, 8 );
             i_part_cost_bi += m->cost_mv;
             /* FIXME: ref cost */
         }
-
-        WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
+        WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, src[0], stride[0], src[1], stride[1] );
         i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
                         + a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
         a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1759,7 +1739,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
     uint8_t **p_fref[2] =
         { h->mb.pic.p_fref[0][a->l0.i_ref],
           h->mb.pic.p_fref[1][a->l1.i_ref] };
-    DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
+    DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
     DECLARE_ALIGNED_4( int16_t mvc[2][2] );
     int i, l;
 
@@ -1770,6 +1750,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
     {
         int i_part_cost;
        int i_part_cost_bi = 0;
+        int stride[2] = {16,16};
+        uint8_t *src[2];
 
        /* TODO: check only the list(s) that were used in b8x8? */
        for( l = 0; l < 2; l++ )
@@ -1790,13 +1772,13 @@
            x264_me_search( h, m, mvc, 2 );
 
            /* BI mode */
-           h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
-                          m->mv[0], m->mv[1], 16, 8 );
+           src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                   m->mv[0], m->mv[1], 16, 8 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
-       WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
+       WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, src[0], stride[0], src[1], stride[1] );
 
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
 
        i_part_cost = a->l0.me16x8[i].cost;
@@ -1839,6 +1821,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
     {
        int i_part_cost;
        int i_part_cost_bi = 0;
+        int stride[2] = {8,8};
+        uint8_t *src[2];
 
        for( l = 0; l < 2; l++ )
        {
@@ -1858,13 +1842,13 @@
            x264_me_search( h, m, mvc, 2 );
 
            /* BI mode */
-           h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
-                          m->mv[0], m->mv[1], 8, 16 );
+           src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
+                                   m->mv[0], m->mv[1], 8, 16 );
            /* FIXME: ref cost */
            i_part_cost_bi += m->cost_mv;
        }
-       WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, pix[1], 8 );
+       WEIGHTED_AVG( PIXEL_8x16, pix[0], 8, src[0], stride[0], src[1], stride[1] );
 
        i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x16]( a->l0.me8x16[i].p_fenc[0], FENC_STRIDE, pix[0], 8 );
 
        i_part_cost = a->l0.me8x16[i].cost;
diff --git a/encoder/me.c b/encoder/me.c
index 63c57863..8892e340 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -787,8 +787,10 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
 #define BIME_CACHE( dx, dy ) \
 { \
     int i = 4 + 3*dx + dy; \
-    h->mc.mc_luma( pix0[i], bw, m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
-    h->mc.mc_luma( pix1[i], bw, m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
+    stride0[i] = bw;\
+    stride1[i] = bw;\
+    src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], om0x+dx, om0y+dy, bw, bh ); \
+    src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], om1x+dx, om1y+dy, bw, bh ); \
 }
 
 #define BIME_CACHE2(a,b) \
@@ -802,11 +804,10 @@ if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \
     int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \
     int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \
     visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\
-    h->mc.memcpy_aligned( pix, pix0[i0], bs ); \
     if( i_weight == 32 ) \
-        h->mc.avg[i_pixel]( pix, bw, pix1[i1], bw ); \
+        h->mc.avg[i_pixel]( pix, bw, src0[i0], stride0[i0], src1[i1], stride1[i1] ); \
     else \
-        h->mc.avg_weight[i_pixel]( pix, bw, pix1[i1], bw, i_weight ); \
+        h->mc.avg_weight[i_pixel]( pix, bw, src1[i1], stride1[i1], src0[i0], stride0[i0], i_weight ); \
     cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, bw ) \
            + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \
            + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \
@@ -838,7 +839,6 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     const int i_pixel = m0->i_pixel;
     const int bw = x264_pixel_size[i_pixel].w;
     const int bh = x264_pixel_size[i_pixel].h;
-    const int bs = bw*bh;
     const int16_t *p_cost_m0x = m0->p_cost_mv - x264_clip3( m0->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
     const int16_t *p_cost_m0y = m0->p_cost_mv - x264_clip3( m0->mvp[1], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
     const int16_t *p_cost_m1x = m1->p_cost_mv - x264_clip3( m1->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
@@ -846,6 +846,10 @@ int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight
     DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] );
     DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] );
     DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+    uint8_t *src0[9];
+    uint8_t *src1[9];
+    int stride0[9];
+    int stride1[9];
     int bm0x = m0->mv[0], om0x = bm0x;
     int bm0y = m0->mv[1], om0y = bm0y;
     int bm1x = m1->mv[0], om1x = bm1x;
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 04034f19..ed10698a 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -95,17 +95,17 @@ static int x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
 }
 #define TRY_BIDIR( mv0, mv1, penalty ) \
 { \
-    int stride2 = 16; \
-    uint8_t *src2; \
+    int stride1 = 16, stride2 = 16; \
+    uint8_t *src1, *src2; \
     int i_cost; \
-    h->mc.mc_luma( pix1, 16, m[0].p_fref, m[0].i_stride[0], \
-                   (mv0)[0], (mv0)[1], 8, 8 ); \
+    src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
+                          (mv0)[0], (mv0)[1], 8, 8 ); \
     src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
-                   (mv1)[0], (mv1)[1], 8, 8 ); \
+                          (mv1)[0], (mv1)[1], 8, 8 ); \
     if( i_bipred_weight != 32 ) \
-        h->mc.avg_weight[PIXEL_8x8]( pix1, 16, src2, stride2, i_bipred_weight ); \
+        h->mc.avg_weight[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
    else \
-        h->mc.avg[PIXEL_8x8]( pix1, 16, src2, stride2 ); \
+        h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2 ); \
     i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
                        m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
     if( i_bcost > i_cost ) \
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 82779bde..37f2b074 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -774,27 +774,27 @@ static int check_mc( int cpu_ref, int cpu_new )
 #define MC_TEST_AVG( name, ... ) \
     for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
     { \
-        memcpy( buf3, buf1, 1024 ); \
-        memcpy( buf4, buf1, 1024 ); \
+        memcpy( buf2, buf1, 1024 ); \
+        memcpy( buf4, buf3, 1024 ); \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
            set_func_name( "%s_%s", #name, pixel_names[i] );\
            used_asm = 1; \
-           call_c1( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
-           call_a1( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+           call_c1( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+           call_a1( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
            if( memcmp( buf3, buf4, 1024 ) ) \
            { \
                ok = 0; \
                fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
            } \
-           call_c2( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
-           call_a2( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
+           call_c2( mc_c.name[i], buf3, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
+           call_a2( mc_a.name[i], buf4, 32, buf2+1, 16, buf1+18, 16, ##__VA_ARGS__ ); \
        } \
     }
 
     MC_TEST_AVG( avg );
     report( "mc avg :" );
     ok = 1; used_asm = 0;
-    for( w = -64; w <= 128 && ok; w++ )
+    for( w = 32; w <= 32 && ok; w++ )
         MC_TEST_AVG( avg_weight, w );
     report( "mc wpredb :" );
-- 
2.40.0
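
Appended illustration (not part of the patch): a minimal standalone sketch of the calling convention this change introduces. After the patch, avg/avg_weight no longer read the destination as one of their inputs; each prediction arrives through its own pointer/stride pair, so either source can point straight at a reference plane returned by get_ref or at a temporary filled by mc_luma. avg_wxh below mirrors pixel_avg_wxh from common/mc.c as rewritten here; the buffer names, sizes and the main() harness are purely illustrative.

/* Hypothetical demo harness -- not x264 code. */
#include <stdint.h>
#include <stdio.h>

static void avg_wxh( uint8_t *dst, int i_dst,
                     uint8_t *src1, int i_src1,
                     uint8_t *src2, int i_src2,
                     int width, int height )
{
    int x, y;
    for( y = 0; y < height; y++ )
    {
        for( x = 0; x < width; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;   /* rounded mean of the two predictions */
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}

int main( void )
{
    uint8_t l0[4*4], l1[4*4], bi[4*4];
    int i;
    for( i = 0; i < 16; i++ )
    {
        l0[i] = (uint8_t)i;        /* stand-in for a list-0 prediction */
        l1[i] = (uint8_t)(2*i);    /* stand-in for a list-1 prediction */
    }
    /* each source carries its own stride; here all three buffers use stride 4 */
    avg_wxh( bi, 4, l0, 4, l1, 4, 4, 4 );
    printf( "bi[5] = %d\n", bi[5] );   /* (5 + 10 + 1) >> 1 = 8 */
    return 0;
}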