MMX/SSE2 high bit depth interleave functions

Author:     Daniel Kang <daniel.d.kang@gmail.com>
AuthorDate: Thu, 16 Dec 2010 12:41:17 +0000 (04:41 -0800)
Committer:  Fiona Glaser <fiona@x264.com>
CommitDate: Mon, 10 Jan 2011 19:38:52 +0000 (11:38 -0800)

diff --git a/common/frame.c b/common/frame.c

index d7e48728e43f572e866e621af3392bcf4667ba40..87e196851d3afbe97072579e0479b21e4052852c 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -300,7 +300,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
          get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
          get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
          h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
-                                     pix[1], stride[1], pix[2], stride[2],
+                                     (pixel*)pix[1], stride[1]/sizeof(pixel),
+                                     (pixel*)pix[2], stride[2]/sizeof(pixel),
                                       h->param.i_width>>1, h->param.i_height>>1 );
      }
      return 0;
diff --git a/common/mc.c b/common/mc.c

index 3632266378d703473b6598b62d5d24e5c1904040..96cc650864465ef1a82c0d82600335a53cd25310 100644 (file)
--- a/common/mc.c
+++ b/common/mc.c
@@ -291,14 +291,14 @@ void x264_plane_copy_c( pixel *dst, int i_dst,
  }
  
  void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h )
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h )
  {
      for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
          for( int x=0; x<w; x++ )
          {
-            dst[2*x]   = ((pixel*)srcu)[x];
-            dst[2*x+1] = ((pixel*)srcv)[x];
+            dst[2*x]   = srcu[x];
+            dst[2*x+1] = srcv[x];
          }
  }
  
diff --git a/common/mc.h b/common/mc.h

index df16355c618ec8ccb05b927412d1c76c633878bf..3667fdf16594a8fd7ca64a1cd08592ec90252eca 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -90,8 +90,8 @@ typedef struct
      void (*plane_copy)( pixel *dst, int i_dst,
                          uint8_t *src, int i_src, int w, int h );
      void (*plane_copy_interleave)( pixel *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h );
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h );
      /* may write up to 15 pixels off the end of each plane */
      void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu,
                                       pixel *dstv, int i_dstv,
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index 68da4b1ecb2381b41d45c0ba1f42da3637132e40..2824f26b20db622d529afda48f8d1e65ce05b8b3 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -881,56 +881,21 @@ cglobal plane_copy_core_mmxext, 6,7
      emms
      RET
  
-%ifdef HIGH_BIT_DEPTH
  
  %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%if mmsize==16
-    mov%4       m0, [%2]
-    mov%4       m1, [%3]
-    SBUTTERFLY  wd, 0, 1, 2
-    mov%5a [%1+ 0], m0
-    mov%5a [%1+16], m1
+%ifdef HIGH_BIT_DEPTH
+%assign x 0
+%rep 16/mmsize
+    mov%4     m0, [%2+(x/2)*mmsize]
+    mov%4     m1, [%3+(x/2)*mmsize]
+    mova      m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mov%5a    [%1+(x+0)*mmsize], m0
+    mov%5a    [%1+(x+1)*mmsize], m2
+    %assign x (x+2)
+%endrep
  %else
-    movq        m0, [%2+0]
-    movq        m1, [%3+0]
-    SBUTTERFLY  wd, 0, 1, 2
-    mov%5q [%1+ 0], m0
-    mov%5q [%1+ 8], m1
-    movq        m0, [%2+8]
-    movq        m1, [%3+8]
-    SBUTTERFLY  wd, 0, 1, 2
-    mov%5q [%1+16], m0
-    mov%5q [%1+24], m1
-%endif
-%endmacro
-
-%macro PLANE_INTERLEAVE 1
-;-----------------------------------------------------------------------------
-; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
-;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2_%1, 4,5
-    mov    r4d, 16
-    FIX_STRIDES r1
-.loop:
-    INTERLEAVE r0, r2, r3, a
-    add    r2, FDEC_STRIDEB
-    add    r3, FDEC_STRIDEB
-    add    r0, r1
-    dec    r4d
-    jg .loop
-    REP_RET
-
-%endmacro ; PLANE_INTERLEAVE
-
-INIT_MMX
-PLANE_INTERLEAVE mmxext
-INIT_XMM
-PLANE_INTERLEAVE sse2
-
-%endif ; HIGH_BIT_DEPTH
-
-%ifndef HIGH_BIT_DEPTH
-%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
      movq   m0, [%2]
  %if mmsize==16
  %ifidn %4, a
@@ -945,11 +910,11 @@ PLANE_INTERLEAVE sse2
      mova   m2, m0
      punpcklbw m0, m1
      punpckhbw m2, m1
-    mov%5a [%1], m0
+    mov%5a [%1+0], m0
      mov%5a [%1+8], m2
  %endif
+%endif ; HIGH_BIT_DEPTH
  %endmacro
-%endif
  
  %macro DEINTERLEAVE 7 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant, is aligned
  %ifdef HIGH_BIT_DEPTH
@@ -1003,7 +968,6 @@ PLANE_INTERLEAVE sse2
  %endif ; HIGH_BIT_DEPTH
  %endmacro
  
-%ifndef HIGH_BIT_DEPTH
  %macro PLANE_INTERLEAVE 1
  ;-----------------------------------------------------------------------------
  ; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
@@ -1011,11 +975,17 @@ PLANE_INTERLEAVE sse2
  ;                                  uint8_t *srcv, int i_srcv, int w, int h )
  ;-----------------------------------------------------------------------------
  ; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core_%1, 6,7
-    mov    r6d, r6m
+cglobal plane_copy_interleave_core_%1, 7,7
+    FIX_STRIDES r1d, r3d, r5d, r6d
+%ifdef HIGH_BIT_DEPTH
+    mov   r1m, r1d
+    mov   r3m, r3d
+    mov   r6m, r6d
+%endif
      movsxdifnidn r1, r1d
      movsxdifnidn r3, r3d
      movsxdifnidn r5, r5d
+    movsxdifnidn r6, r6d
      lea    r0, [r0+r6*2]
      add    r2,  r6
      add    r4,  r6
@@ -1024,10 +994,10 @@ cglobal plane_copy_interleave_core_%1, 6,7
  %else
      DECLARE_REG_TMP 1,3
  %endif
+    mov  t1, r1
+    shr  t1, SIZEOF_PIXEL
+    sub  t1, r6
      mov  t0d, r7m
-    mov  t1d, r1d
-    shr  t1d, 1
-    sub  t1d, r6d
  .loopy:
      mov    r6d, r6m
      neg    r6
@@ -1039,21 +1009,25 @@ cglobal plane_copy_interleave_core_%1, 6,7
      mov    r6d, r6m
      neg    r6
  .loopx:
-    INTERLEAVE r0+r6*2,    r2+r6,   r4+r6,   u, nt
-    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
-    add    r6, 16
+    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
+    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
+    add    r6, 16*SIZEOF_PIXEL
      jl .loopx
  .pad:
+%assign n 0
+%rep SIZEOF_PIXEL
  %if mmsize==8
-    movntq [r0+r6*2], m0
-    movntq [r0+r6*2+8], m0
-    movntq [r0+r6*2+16], m0
-    movntq [r0+r6*2+24], m0
+    movntq [r0+r6*2+(n+ 0)], m0
+    movntq [r0+r6*2+(n+ 8)], m0
+    movntq [r0+r6*2+(n+16)], m0
+    movntq [r0+r6*2+(n+24)], m0
  %else
-    movntdq [r0+r6*2], m0
-    movntdq [r0+r6*2+16], m0
+    movntdq [r0+r6*2+(n+ 0)], m0
+    movntdq [r0+r6*2+(n+16)], m0
  %endif
-    add    r6, 16
+    %assign n n+32
+%endrep
+    add    r6, 16*SIZEOF_PIXEL
      cmp    r6, t1
      jl .pad
      add    r0, r1mp
@@ -1070,17 +1044,17 @@ cglobal plane_copy_interleave_core_%1, 6,7
  ;-----------------------------------------------------------------------------
  cglobal store_interleave_8x8x2_%1, 4,5
      mov    r4d, 4
+    FIX_STRIDES r1d
  .loop:
-    INTERLEAVE r0, r2, r3, a
-    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
-    add    r2, FDEC_STRIDE*2
-    add    r3, FDEC_STRIDE*2
+    INTERLEAVE r0+ 0, r2+           0, r3+           0, a
+    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
+    add    r2, FDEC_STRIDEB*2
+    add    r3, FDEC_STRIDEB*2
      lea    r0, [r0+r1*2]
      dec    r4d
      jg .loop
      REP_RET
  %endmacro ; PLANE_INTERLEAVE
-%endif ; !HIGH_BIT_DEPTH
  
  %macro DEINTERLEAVE_START 1
  %ifdef HIGH_BIT_DEPTH
@@ -1161,8 +1135,10 @@ cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
  
  %ifdef HIGH_BIT_DEPTH
  INIT_MMX
+PLANE_INTERLEAVE mmxext
  PLANE_DEINTERLEAVE mmx
  INIT_XMM
+PLANE_INTERLEAVE sse2
  PLANE_DEINTERLEAVE sse2
  %else
  INIT_MMX
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index 2d0a0b0e711aacaacfe58a1fae31145c7ff0dfaa..b71f8337b77ea38cfa611fcd7049193a895a4081 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -86,15 +86,15 @@ void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
  void x264_prefetch_ref_mmxext( uint8_t *, int, int );
  void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
  void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h );
-void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst,
-                                             uint8_t *srcu, int i_srcu,
-                                             uint8_t *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst,
-                                           uint8_t *srcu, int i_srcu,
-                                           uint8_t *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_mmxext( pixel *dst, int i_dst,
+                                             pixel *srcu, int i_srcu,
+                                             pixel *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
+                                           pixel *srcu, int i_srcu,
+                                           pixel *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h );
  void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu,
                                         pixel *dstv, int i_dstv,
                                         pixel *src, int i_src, int w, int h );
@@ -442,11 +442,12 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i
          x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
      }
  }
+#endif // HIGH_BIT_DEPTH
  
  #define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\
-                                              uint8_t *srcu, int i_srcu,\
-                                              uint8_t *srcv, int i_srcv, int w, int h )\
+static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
+                                              pixel *srcu, int i_srcu,\
+                                              pixel *srcv, int i_srcv, int w, int h )\
  {\
      if( !(w&15) ) {\
          x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
@@ -463,7 +464,6 @@ static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\
  
  PLANE_INTERLEAVE(mmxext)
  PLANE_INTERLEAVE(sse2)
-#endif // HIGH_BIT_DEPTH
  
  void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
  {
@@ -488,6 +488,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      if( !(cpu&X264_CPU_MMXEXT) )
          return;
  
+    pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
+
      pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
      pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
      pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;
@@ -513,6 +515,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
      pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
  
+    pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
  
      if( cpu&X264_CPU_SSE2_IS_FAST )
@@ -560,7 +563,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
      pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
  
      pf->plane_copy = x264_plane_copy_mmxext;
-    pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
      pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
  
      pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
Commit summary:

Author:     Daniel Kang <daniel.d.kang@gmail.com>
AuthorDate: Thu, 16 Dec 2010 12:41:17 +0000 (04:41 -0800)
Committer:  Fiona Glaser <fiona@x264.com>
CommitDate: Mon, 10 Jan 2011 19:38:52 +0000 (11:38 -0800)

Files changed in this commit:
  common/frame.c
  common/mc.c
  common/mc.h
  common/x86/mc-a2.asm
  common/x86/mc-c.c