In the half-pel (hpel) search, merge two 16x16 motion-compensation (mc) calls into one 16x17 call. Hpel search is 15% faster; 0.3% faster overall.

author Loren Merritt <pengvado@videolan.org>

Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)

committer Loren Merritt <pengvado@videolan.org>

Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)
author Loren Merritt <pengvado@videolan.org>
Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)
committer Loren Merritt <pengvado@videolan.org>
Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)
diff --git a/common/amd64/amd64inc.asm b/common/amd64/amd64inc.asm

index 44422789da5d32c61735cdc185f520af2d310dca..e9409965c5e4a3d868f2a85b8a25a5cc7179ed2a 100644 (file)
--- a/common/amd64/amd64inc.asm
+++ b/common/amd64/amd64inc.asm
@@ -78,6 +78,11 @@ BITS 64
  %define parm7d dword parm7q
  %define parm8d dword parm8q
  
+%define temp1q rdi
+%define temp2q rsi
+%define temp1d edi
+%define temp2d esi
+
  %macro firstpush 1
      db 0x48
      push %1
@@ -234,6 +239,11 @@ SECTION .text
  %define parm7d dword parm7q
  %define parm8d dword parm8q
  
+%define temp1q r9
+%define temp2q r8
+%define temp1d r9d
+%define temp2d r8d
+
  %macro allocstack 1
  %endmacro
  
diff --git a/common/amd64/mc-a.asm b/common/amd64/mc-a.asm

index 8ae1416bcf30c6735885efbfd7938fb592460a63..7e0bfa27cf75c45392238a38ce5782e6d387aa8f 100644 (file)
--- a/common/amd64/mc-a.asm
+++ b/common/amd64/mc-a.asm
@@ -59,6 +59,7 @@ SECTION .text
  cglobal x264_pixel_avg_w4_mmxext
  cglobal x264_pixel_avg_w8_mmxext
  cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
  cglobal x264_pixel_avg_w16_sse2
  
  cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -103,7 +104,7 @@ ALIGN 4
      lea         parm3q, [parm3q+parm4q*2]
      lea         r10, [r10+r11*2]
      lea         parm1q, [parm1q+parm2q*2]
-    jne         .height_loop
+    jg          .height_loop
      rep ret
  
                            
@@ -132,7 +133,7 @@ ALIGN 4
      lea         parm3q, [parm3q+parm4q*2]
      lea         r10, [r10+r11*2]
      lea         parm1q, [parm1q+parm2q*2]
-    jne         .height_loop
+    jg          .height_loop
      rep ret
  
  ALIGN 16
@@ -159,7 +160,37 @@ ALIGN 4
      lea         parm3q, [parm3q+parm4q]
      lea         r10, [r10+r11]
      lea         parm1q, [parm1q+parm2q]
-    jne         .height_loop
+    jg          .height_loop
+    rep ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst,  int i_dst_stride,
+;                                 uint8_t *src1, int i_src1_stride,
+;                                 uint8_t *src2, int i_src2_stride,
+;                                 int i_height );
+;-----------------------------------------------------------------------------
+x264_pixel_avg_w20_mmxext:
+    mov         r10, parm5q         ; src2
+    movsxd      r11, parm6d         ; i_src2_stride
+    mov         eax, parm7d         ; i_height
+
+ALIGN 4
+.height_loop    
+    movq        mm0, [parm3q   ]
+    movq        mm1, [parm3q+8 ]
+    movd        mm2, [parm3q+16]
+    pavgb       mm0, [r10   ]
+    pavgb       mm1, [r10+8 ]
+    pavgb       mm2, [r10+16]
+    movq        [parm1q   ], mm0
+    movq        [parm1q+8 ], mm1
+    movd        [parm1q+16], mm2
+    dec         eax
+    lea         parm3q, [parm3q+parm4q]
+    lea         r10, [r10+r11]
+    lea         parm1q, [parm1q+parm2q]
+    jg          .height_loop
      rep ret
  
  ALIGN 16
@@ -183,7 +214,7 @@ ALIGN 4
      lea         parm3q, [parm3q+parm4q]
      lea         r10, [r10+r11]
      lea         parm1q, [parm1q+parm2q]
-    jne         .height_loop
+    jg          .height_loop
      rep ret
  
  
@@ -244,7 +275,7 @@ x264_pixel_avg_weight_w16_mmxext:
      add  parm1q, parm2q
      add  parm3q, parm4q
      dec  r11d
-    jnz  .height_loop
+    jg   .height_loop
      rep ret
  
  ALIGN 16
@@ -260,7 +291,7 @@ x264_pixel_avg_weight_w8_mmxext:
      add  parm1q, parm2q
      add  parm3q, parm4q
      dec  r11d
-    jnz  .height_loop
+    jg   .height_loop
      rep ret
  
  ALIGN 16
@@ -301,7 +332,7 @@ ALIGN 4
      lea     parm1q, [parm1q+parm2q*2]
      dec     eax
      dec     eax
-    jne     .height_loop
+    jg      .height_loop
      rep ret
  
  ALIGN 16
@@ -329,7 +360,7 @@ ALIGN 4
      lea     parm1q, [parm1q+parm2q*4]
      
      sub     eax, byte 4
-    jnz     .height_loop
+    jg      .height_loop
      rep ret
  
  ALIGN 16
@@ -364,7 +395,7 @@ ALIGN 4
      lea     parm3q, [parm3q+parm4q*4]
      lea     parm1q, [parm1q+parm2q*4]
      sub     eax, byte 4
-    jnz     .height_loop
+    jg      .height_loop
      rep ret
  
  
@@ -384,7 +415,7 @@ ALIGN 4
      sub     eax, byte 2
      lea     parm3q, [parm3q+parm4q*2]
      lea     parm1q, [parm1q+parm2q*2]
-    jnz     .height_loop
+    jg      .height_loop
      rep ret
  
  
diff --git a/common/i386/mc-a.asm b/common/i386/mc-a.asm

index 6f233c4c4a40628a322f3b7bcda23efff9c6b9c7..2e68b994ba5b8635c314bf042b39bee1ce6bdc29 100644 (file)
--- a/common/i386/mc-a.asm
+++ b/common/i386/mc-a.asm
@@ -59,6 +59,7 @@ SECTION .text
  cglobal x264_pixel_avg_w4_mmxext
  cglobal x264_pixel_avg_w8_mmxext
  cglobal x264_pixel_avg_w16_mmxext
+cglobal x264_pixel_avg_w20_mmxext
  cglobal x264_pixel_avg_w16_sse2
  
  cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -112,7 +113,7 @@ ALIGN 4
      lea         ebx, [ebx+eax*2]
      lea         ecx, [ecx+edx*2]
      lea         edi, [edi+esi*2]
-    jne         .height_loop
+    jg          .height_loop
  
      pop         edi
      pop         esi
@@ -151,7 +152,7 @@ ALIGN 4
      lea         ebx, [ebx+eax]
      lea         ecx, [ecx+edx]
      lea         edi, [edi+esi]
-    jne         .height_loop
+    jg          .height_loop
  
      pop         edi
      pop         esi
@@ -193,7 +194,7 @@ ALIGN 4
      lea         ebx, [ebx+eax]
      lea         ecx, [ecx+edx]
      lea         edi, [edi+esi]
-    jne         .height_loop
+    jg          .height_loop
  
      pop         edi
      pop         esi
@@ -201,6 +202,53 @@ ALIGN 4
      pop         ebp
      ret
  
+
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_pixel_avg_w20_mmxext( uint8_t *dst,  int i_dst_stride,
+;                                 uint8_t *src1, int i_src1_stride,
+;                                 uint8_t *src2, int i_src2_stride,
+;                                 int i_height );
+;-----------------------------------------------------------------------------
+x264_pixel_avg_w20_mmxext:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi
+
+    mov         edi, [esp+20]       ; dst
+    mov         ebx, [esp+28]       ; src1
+    mov         ecx, [esp+36]       ; src2
+    mov         esi, [esp+24]       ; i_dst_stride
+    mov         eax, [esp+32]       ; i_src1_stride
+    mov         edx, [esp+40]       ; i_src2_stride
+    mov         ebp, [esp+44]       ; i_height
+ALIGN 4
+.height_loop    
+    movq        mm0, [ebx   ]
+    movq        mm1, [ebx+8 ]
+    movd        mm2, [ebx+16]
+    pavgb       mm0, [ecx   ]
+    pavgb       mm1, [ecx+8 ]
+    pavgb       mm2, [ecx+16]
+    movq        [edi   ], mm0
+    movq        [edi+8 ], mm1
+    movd        [edi+16], mm2
+    dec         ebp
+    lea         ebx, [ebx+eax]
+    lea         ecx, [ecx+edx]
+    lea         edi, [edi+esi]
+    jg          .height_loop
+
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+
  ALIGN 16
  ;-----------------------------------------------------------------------------
  ; void x264_pixel_avg_w16_sse2( uint8_t *dst,  int i_dst_stride,
@@ -231,7 +279,7 @@ ALIGN 4
      lea         ebx, [ebx+eax]
      lea         ecx, [ecx+edx]
      lea         edi, [edi+esi]
-    jne         .height_loop
+    jg          .height_loop
  
      pop         edi
      pop         esi
@@ -302,7 +350,7 @@ x264_pixel_avg_weight_w16_mmxext:
      add  edi, esi
      add  edx, ecx
      dec  eax
-    jnz  .height_loop
+    jg   .height_loop
      BIWEIGHT_END_MMX
  
  ALIGN 16
@@ -323,7 +371,7 @@ x264_pixel_avg_weight_w8_mmxext:
      lea  edi, [edi+esi*2]
      lea  edx, [edx+ecx*2]
      sub  eax, byte 2
-    jnz  .height_loop
+    jg   .height_loop
      BIWEIGHT_END_MMX
  
  ALIGN 16
@@ -371,7 +419,7 @@ ALIGN 4
      lea     edi, [edi+edx*2]
      dec     ecx
      dec     ecx
-    jne     .height_loop
+    jg      .height_loop
  
      pop     edi
      pop     esi
@@ -409,7 +457,7 @@ ALIGN 4
      lea     edi, [edi+edx*2]
      
      sub     ecx, byte 4
-    jnz     .height_loop
+    jg      .height_loop
  
      pop     edi
      pop     esi
@@ -455,7 +503,7 @@ ALIGN 4
      lea     esi, [esi+ebx*2]
      lea     edi, [edi+edx*2]
      sub     ecx, byte 4
-    jnz     .height_loop
+    jg      .height_loop
      
      pop     edi
      pop     esi
@@ -488,7 +536,7 @@ ALIGN 4
      dec     ecx
      lea     esi, [esi+ebx*2]
      lea     edi, [edi+edx*2]
-    jnz     .height_loop
+    jg      .height_loop
      
      pop     edi
      pop     esi
diff --git a/common/i386/mc-c.c b/common/i386/mc-c.c

index 2f89b105199800322ebe8605d58a2e5cc26e8577..207c0c670bffd2afbc81f0ace7dc2cfb8c9625ab 100644 (file)
--- a/common/i386/mc-c.c
+++ b/common/i386/mc-c.c
@@ -31,6 +31,7 @@
  extern void x264_pixel_avg_w4_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
  extern void x264_pixel_avg_w8_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
  extern void x264_pixel_avg_w16_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
+extern void x264_pixel_avg_w20_mmxext( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
  extern void x264_pixel_avg_w16_sse2( uint8_t *,  int, uint8_t *, int, uint8_t *, int, int );
  extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
  extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -68,13 +69,14 @@ AVG_WEIGHT(8,16)
  AVG_WEIGHT(8,8)
  AVG_WEIGHT(8,4)
  
-static void (* const x264_pixel_avg_wtab_mmxext[5])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
+static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
  {
      NULL,
      x264_pixel_avg_w4_mmxext,
      x264_pixel_avg_w8_mmxext,
-    NULL,
-    x264_pixel_avg_w16_mmxext
+    x264_pixel_avg_w16_mmxext,
+    x264_pixel_avg_w16_mmxext,
+    x264_pixel_avg_w20_mmxext,
  };
  static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
  {
diff --git a/common/mc.h b/common/mc.h

index 9c9fe517ca39e0cb28e3905d76486280ca47dc45..2a0227374c170960d7f4fe9505f6f294bd934efd 100644 (file)
--- a/common/mc.h
+++ b/common/mc.h
@@ -37,6 +37,7 @@ typedef struct
                      int mvx, int mvy,
                      int i_width, int i_height );
  
+    /* may round up the dimensions if they're not a power of 2 */
      uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
                          int mvx, int mvy,
                          int i_width, int i_height );
diff --git a/common/ppc/mc.c b/common/ppc/mc.c

index 2573f0e96bb51cbf7d0313130d88c0a2d9c8888b..96245174f0c5e296558739acc9f4fd340b10b697 100644 (file)
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -224,10 +224,19 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
              pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
                            src2, i_src_stride, i_height );
              break;
+        case 12:
          case 16:
          default:
              pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
                            src2, i_src_stride, i_height );
+            break;
+        case 20:
+            //FIXME suboptimal
+            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
+                          src2, i_src_stride, i_height );
+            pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
+                          src2+16, i_src_stride, i_height );
+            break;
          }
          return dst;
  
diff --git a/encoder/me.c b/encoder/me.c

index 60edc7941a27e18c0ff26716d4389bfc985772c0..ccffe2dc042bf231d2db3a1402731df6dd90a9d5 100644 (file)
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -586,7 +586,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
      const int i_pixel = m->i_pixel;
      const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
  
-    DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
+    DECLARE_ALIGNED( uint8_t, pix[2][32*18], 16 ); // really 17x17, but round up for alignment
      int omx, omy;
      int i;
  
@@ -610,20 +610,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
      {
          int omx = bmx, omy = bmy;
          int costs[4];
-        int stride = 16; // candidates are either all hpel or all qpel, so one stride is enough
+        int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
          uint8_t *src0, *src1, *src2, *src3;
-        src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh );
-        src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[2], &stride, omx-2, omy, bw, bh );
-        if( (omx|omy)&1 )
-        {
-            src1 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx, omy+2, bw, bh );
-            src3 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[3], &stride, omx+2, omy, bw, bh );
-        }
-        else
-        {
-            src1 = src0 + stride;
-            src3 = src2 + 1;
-        }
+        src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
+        src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
+        src1 = src0 + stride;
+        src3 = src2 + 1;
          h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
          COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-2], bmy, omy-2 );
          COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+2], bmy, omy+2 );
author	Loren Merritt <pengvado@videolan.org>
	Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)
committer	Loren Merritt <pengvado@videolan.org>
	Mon, 2 Apr 2007 23:56:09 +0000 (23:56 +0000)
common/amd64/amd64inc.asm		patch | blob | history
common/amd64/mc-a.asm		patch | blob | history
common/i386/mc-a.asm		patch | blob | history
common/i386/mc-c.c		patch | blob | history
common/mc.h		patch | blob | history
common/ppc/mc.c		patch | blob | history
encoder/me.c		patch | blob | history