int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
+ int luma_plane_size;
if( !frame ) return NULL;
frame->i_stride[i] = i_stride >> !!i;
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
- CHECKED_MALLOC( frame->buffer[i],
- frame->i_stride[i] * (i_lines + 2*i_padv) >> !!i );
- frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
- ((frame->i_stride[i] * i_padv + PADH) >> !!i);
}
- frame->filtered[0] = frame->plane[0];
- for( i = 0; i < 3; i++ )
+ luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
+ for( i = 1; i < 3; i++ )
{
- CHECKED_MALLOC( frame->buffer[4+i],
- frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) );
- frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
- frame->i_stride[0] * i_padv + PADH;
+ CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
+ frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
+ /* all 4 luma planes allocated together, since the cacheline split code
+ * requires them to be in-phase wrt cacheline alignment. */
+ CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
+ for( i = 0; i < 4; i++ )
+ frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ frame->plane[0] = frame->filtered[0];
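+ /* Why the in-phase requirement holds (a sketch, assuming i_stride and
+  * i_lines + 2*i_padv are both multiples of 16, as in x264's frame setup,
+  * which is not shown in this hunk): luma_plane_size is then a multiple of
+  * 256, so consecutive filtered planes differ by a multiple of 64 bytes and
+  * share the same offset within a 64-byte cacheline, i.e.
+  *     for( i = 1; i < 4; i++ )
+  *         assert( ((frame->filtered[i] - frame->filtered[0]) & 63) == 0 );
+  */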
if( h->frames.b_have_lowres )
{
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
- CHECKED_MALLOC( frame->buffer[7],
+ CHECKED_MALLOC( frame->buffer[3],
2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
- frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH;
+ frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
frame->i_poc = -1;
void x264_frame_delete( x264_frame_t *frame )
{
int i, j;
- for( i = 0; i < 8; i++ )
+ for( i = 0; i < 4; i++ )
x264_free( frame->buffer[i] );
for( i = 0; i < 4; i++ )
x264_free( frame->buffer_lowres[i] );
pw_8: times 4 dw 8
pw_32: times 4 dw 32
pw_64: times 4 dw 64
+sw_64: dd 64  ; used by INIT_SHIFT below to form the complementary (64-n) shift count
SECTION .text
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w16_sse2, 6,7
+%macro PIXEL_AVG_SSE 1
+cglobal x264_pixel_avg2_w16_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
jg .height_loop
REP_RET
-cglobal x264_pixel_avg2_w20_sse2, 6,7
+cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
sub r5d, 2
jg .height_loop
REP_RET
+%endmacro
+
+PIXEL_AVG_SSE sse2
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+PIXEL_AVG_SSE sse3
+%undef movdqu
+%endif
+
+; Cacheline split code for processors with high latencies for loads
+; split over cache lines. See sad-a.asm for a more detailed explanation.
+; This particular instance is complicated by the fact that src1 and src2
+; can have different alignments. For simplicity and code size, only the
+; MMX cacheline workaround is used. As a result, in the case of SSE2
+; pixel_avg, the cacheline check functions call the SSE2 version if there
+; is no cacheline split, and the MMX workaround if there is.
+
+%macro INIT_SHIFT 2
+ and eax, 7
+ shl eax, 3
+%ifdef PIC32
+ ; both versions work, but picgetgot is slower than gpr->mmx, which in turn is slower than mem->mmx
+ mov r2, 64
+ sub r2, eax
+ movd %2, eax
+ movd %1, r2
+%else
+ movd %1, [sw_64 GLOBAL]
+ movd %2, eax
+ psubw %1, %2
+%endif
+%endmacro
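+; (Illustrative: if src & 7 == 3, INIT_SHIFT leaves 24 in %2 and 40 in %1;
+; AVG_CACHELINE_LOOP below shifts the aligned low qword right by %2 bits and
+; the following qword left by %1 bits, then ORs them to recover the unaligned
+; 8 bytes.)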
+
+%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
+cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
+ mov eax, r2m
+ and eax, 0x1f|(%2>>1)
+ cmp eax, (32-%1)|(%2>>1)
+ jle x264_pixel_avg2_w%1_%3
+;w12 isn't needed because w16 is just as fast if there's no cacheline split
+%if %1 == 12
+ jmp x264_pixel_avg2_w16_cache_mmxext
+%else
+ jmp x264_pixel_avg2_w%1_cache_mmxext
+%endif
+%endmacro
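+
+; In C terms, the test above is roughly the following (a sketch for the
+; 64-byte-cacheline variants; the 32-byte variants fold the cacheline size
+; into the same mask/compare constants, and "spans_cacheline" is a
+; hypothetical name, not an x264 function):
+;
+;     static int spans_cacheline( intptr_t src, int width )
+;     {
+;         return (src & 63) + width > 64;
+;     }
+;
+; i.e. stay on the plain x264_pixel_avg2_w%1_%3 path only when a width-byte
+; read starting at src fits inside a single cacheline.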
+
+%macro AVG_CACHELINE_START 0
+ %assign stack_offset 0
+ INIT_SHIFT mm6, mm7 ; eax still holds src1's low bits, left there by AVG_CACHELINE_CHECK
+ mov eax, r4m
+ INIT_SHIFT mm4, mm5 ; and now src2's
+ PROLOGUE 6,6,0
+ and r2, ~7
+ and r4, ~7
+ sub r4, r2
+.height_loop:
+%endmacro
+%macro AVG_CACHELINE_LOOP 2
+ movq mm0, [r2+8+%1]
+ movq mm1, [r2+%1]
+ movq mm2, [r2+r4+8+%1]
+ movq mm3, [r2+r4+%1]
+ psllq mm0, mm6
+ psrlq mm1, mm7
+ psllq mm2, mm4
+ psrlq mm3, mm5
+ por mm0, mm1
+ por mm2, mm3
+ pavgb mm0, mm2
+ %2 [r0+%1], mm0
+%endmacro
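+
+; One iteration of the loop above, as a C sketch (hypothetical helper name;
+; assumes little-endian 64-bit loads, src already rounded down to an 8-byte
+; boundary by "and r2, ~7", and shift = 8*(src&7) as computed by INIT_SHIFT):
+;
+;     static uint64_t load8_shifted( const uint64_t *p, int shift )
+;     {
+;         uint64_t lo = p[0] >> shift;                   /* psrlq: drop the bytes before src */
+;         uint64_t hi = shift ? p[1] << (64-shift) : 0;  /* psllq; a shift of 64 yields 0 in MMX */
+;         return lo | hi;                                /* por: the 8 bytes starting at src */
+;     }
+;
+; pavgb then averages the two reconstructed qwords bytewise with rounding:
+; dst[i] = ( src1[i] + src2[i] + 1 ) >> 1.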
+x264_pixel_avg2_w8_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w16_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+x264_pixel_avg2_w20_cache_mmxext:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+ AVG_CACHELINE_LOOP 8, movq
+ AVG_CACHELINE_LOOP 16, movd
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+
+%ifndef ARCH_X86_64
+AVG_CACHELINE_CHECK 8, 32, mmxext
+AVG_CACHELINE_CHECK 12, 32, mmxext
+AVG_CACHELINE_CHECK 16, 32, mmxext
+AVG_CACHELINE_CHECK 20, 32, mmxext
+AVG_CACHELINE_CHECK 16, 64, mmxext
+AVG_CACHELINE_CHECK 20, 64, mmxext
+%endif
+
+AVG_CACHELINE_CHECK 8, 64, mmxext
+AVG_CACHELINE_CHECK 12, 64, mmxext
+AVG_CACHELINE_CHECK 16, 64, sse2
+AVG_CACHELINE_CHECK 20, 64, sse2
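+
+; The instantiations above emit entry points such as x264_pixel_avg2_w8_cache32_mmxext
+; and x264_pixel_avg2_w16_cache64_sse2; the C-side wtabs in mc-c.c (below) pick
+; them up when the detected cacheline size calls for the workaround.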
;=============================================================================
; pixel copy
%endmacro
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
+; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
+; but with SSE3 the overhead is zero, so there's no reason not to include it.
+%ifdef HAVE_SSE3
+COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
+%endif
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
-extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
+#define PIXEL_AVG_W(width,cpu)\
+extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+/* This declares some functions that don't exist, but that isn't a problem. */
+#define PIXEL_AVG_WALL(cpu)\
+PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
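+/* For example, PIXEL_AVG_WALL(sse2) declares, among others,
+ *   extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+ * along with a few widths for which no sse2 asm actually exists. */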
+
+PIXEL_AVG_WALL(mmxext)
+PIXEL_AVG_WALL(cache32_mmxext)
+PIXEL_AVG_WALL(cache64_mmxext)
+PIXEL_AVG_WALL(cache64_sse2)
+PIXEL_AVG_WALL(sse2)
+PIXEL_AVG_WALL(sse3)
+
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
-static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
-{
- NULL,
- x264_pixel_avg2_w4_mmxext,
- x264_pixel_avg2_w8_mmxext,
- x264_pixel_avg2_w12_mmxext,
- x264_pixel_avg2_w16_mmxext,
- x264_pixel_avg2_w20_mmxext,
-};
-static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
-{
- NULL,
- x264_mc_copy_w4_mmx,
- x264_mc_copy_w8_mmx,
- NULL,
- x264_mc_copy_w16_mmx
-};
-static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
-{
- NULL,
- x264_pixel_avg2_w4_mmxext,
- x264_pixel_avg2_w8_mmxext,
- x264_pixel_avg2_w12_mmxext,
- x264_pixel_avg2_w16_sse2,
- x264_pixel_avg2_w20_sse2,
+#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
+static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
+{\
+ NULL,\
+ x264_pixel_avg2_w4_##name1,\
+ x264_pixel_avg2_w8_##name2,\
+ x264_pixel_avg2_w12_##name3,\
+ x264_pixel_avg2_w16_##name4,\
+ x264_pixel_avg2_w20_##name5,\
};
-static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
-{
- NULL,
- x264_mc_copy_w4_mmx,
- x264_mc_copy_w8_mmx,
- NULL,
- x264_mc_copy_w16_sse2,
+
+/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
+#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
+#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
+
+PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
+#ifdef ARCH_X86
+PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
+#endif
+PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
+PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
+PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
+#ifdef HAVE_SSE3
+PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
+#endif
+
+#define MC_COPY_WTAB(instr, name1, name2, name3)\
+static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
+{\
+ NULL,\
+ x264_mc_copy_w4_##name1,\
+ x264_mc_copy_w8_##name2,\
+ NULL,\
+ x264_mc_copy_w16_##name3,\
};
+
+MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#ifdef HAVE_SSE3
+MC_COPY_WTAB(sse3,mmx,mmx,sse3)
+#endif
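+
+/* Both kinds of table are indexed by width/4 in the MC_LUMA/GET_REF wrappers
+ * (whose bodies are elided from this hunk), e.g. under that convention
+ *   x264_pixel_avg_wtab_sse2[16>>2] == x264_pixel_avg2_w16_sse2 and
+ *   x264_mc_copy_wtab_sse2[16>>2]   == x264_mc_copy_w16_sse2. */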
+
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
}
MC_LUMA(mmxext,mmxext,mmx)
+#ifdef ARCH_X86
+MC_LUMA(cache32_mmxext,cache32_mmxext,mmx)
+MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
+#endif
MC_LUMA(sse2,sse2,sse2)
+MC_LUMA(cache64_sse2,cache64_sse2,sse2)
+#ifdef HAVE_SSE3
+MC_LUMA(cache64_sse3,cache64_sse3,sse3)
+#endif
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
}
GET_REF(mmxext)
+#ifdef ARCH_X86
+GET_REF(cache32_mmxext)
+GET_REF(cache64_mmxext)
+#endif
GET_REF(sse2)
+GET_REF(cache64_sse2)
+#ifdef HAVE_SSE3
+GET_REF(cache64_sse3)
+#endif
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
+#ifdef ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
+ if( cpu&X264_CPU_CACHELINE_32 )
+ {
+ pf->mc_luma = mc_luma_cache32_mmxext;
+ pf->get_ref = get_ref_cache32_mmxext;
+ }
+ else if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ pf->mc_luma = mc_luma_cache64_mmxext;
+ pf->get_ref = get_ref_cache64_mmxext;
+ }
+#endif
+
if( !(cpu&X264_CPU_SSE2) )
return;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ pf->mc_luma = mc_luma_cache64_sse2;
+ pf->get_ref = get_ref_cache64_sse2;
+#ifdef HAVE_SSE3
+ /* on Core2, lddqu is implemented as plain movdqu, so it gives no cacheline-split benefit there */
+ if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
+ {
+ pf->mc_luma = mc_luma_cache64_sse3;
+ pf->get_ref = get_ref_cache64_sse3;
+ }
+#endif
+ }
+
if( !(cpu&X264_CPU_SSSE3) )
return;