A few tweaks to decimate asm

author Fiona Glaser <fiona@x264.com>

Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)

committer Fiona Glaser <fiona@x264.com>

Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)
author Fiona Glaser <fiona@x264.com>
Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)
committer Fiona Glaser <fiona@x264.com>
Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm

index f89eaf69bf7e869c4688bd13ff4d0fe799628312..80cf5b57044d25591229c4f8d438bfbb257fe47c 100644 (file)
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -521,16 +521,14 @@ cglobal x264_decimate_score64_%1, 1,4
      DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
      shl   r2, 48
      or    r1, r2
-    not   r1
-    test  r1, r1
+    xor   r1, -1
      je   .ret
      or    eax, r3d
      jne  .ret9
  .loop:
      bsf   rcx, r1
      shr   r1, cl
-    movzx ecx, byte [table + rcx]
-    add   eax, ecx
+    add   al, byte [table + rcx]
      shr   r1, 1
      jne  .loop
  .ret:
@@ -557,28 +555,33 @@ cglobal x264_decimate_score64_%1, 1,5
      DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
      shl   r1, 16
      or    r4, r1
-    not   r3
-    not   r4
-    mov   r1, r3
-    or    r1, r4
-    je   .ret
+    xor   r3, -1
+    je   .tryret
+    xor   r4, -1
+.cont
      or    r0, r2
-    jne  .ret9    ;r2 is zero at this point, so we don't need to zero it
+    jne  .ret9      ;r0 is zero at this point, so we don't need to zero it
  .loop:
      bsf   ecx, r3
      test  r3, r3
      je   .largerun
      shrd  r3, r4, cl
      shr   r4, cl
-    movzx ecx, byte [x264_decimate_table8 + ecx]
-    add   r0, ecx
+    add   r0b, byte [x264_decimate_table8 + ecx]
      shrd  r3, r4, 1
      shr   r4, 1
-    mov   r2, r3
-    or    r2, r4
+    cmp   r0, 6     ;score64's threshold is never higher than 6
+    jge  .ret9      ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
+    test  r3, r3
+    jne  .loop
+    test  r4, r4
      jne  .loop
  .ret:
      REP_RET
+.tryret:
+    xor   r4, -1
+    jne  .cont
+    REP_RET
  .ret9:
      mov   eax, 9
      RET
diff --git a/tools/checkasm.c b/tools/checkasm.c

index f7366b99bf447e2198b0f04bbddd6f2aa0c36c95..fb33a604d60390befd189fb5d9afca1e53ddab75 100644 (file)
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1078,7 +1078,7 @@ static int check_quant( int cpu_ref, int cpu_new )
      }
      report( "denoise dct :" );
  
-#define TEST_DECIMATE( qname, decname, block, w, ac ) \
+#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \
      if( qf_a.decname != qf_ref.decname ) \
      { \
          set_func_name( #decname ); \
@@ -1093,7 +1093,7 @@ static int check_quant( int cpu_ref, int cpu_new )
              memcpy( dct2, dct1, w*w*2 ); \
              result_c = call_c1( qf_c.decname, (void*)dct2 ); \
              result_a = call_a1( qf_a.decname, (void*)dct2 ); \
-            if( result_c != result_a ) \
+            if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
              { \
                  ok = 0; \
                  fprintf( stderr, #decname ": [FAILED]\n" ); \
@@ -1104,9 +1104,9 @@ static int check_quant( int cpu_ref, int cpu_new )
          } \
      }
  
-    TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 );
-    TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 );
-    TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 );
+    TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
+    TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
+    TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );
      report( "decimate_score :" );
  
      return ret;
author	Fiona Glaser <fiona@x264.com>
	Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)
committer	Fiona Glaser <fiona@x264.com>
	Fri, 21 Nov 2008 04:11:14 +0000 (20:11 -0800)
common/x86/quant-a.asm		patch | blob | history
tools/checkasm.c		patch | blob | history