granicus.if.org Git - libx264/commitdiff
port sad_*_x3_sse2 to x86_64
author Loren Merritt <pengvado@videolan.org>
Mon, 24 Sep 2007 13:37:44 +0000 (13:37 +0000)
committer Loren Merritt <pengvado@videolan.org>
Mon, 24 Sep 2007 13:37:44 +0000 (13:37 +0000)
git-svn-id: svn://svn.videolan.org/x264/trunk@680 df754926-b1dd-0310-bc7b-ec298dee348c

common/amd64/pixel-sse2.asm
common/pixel.c

index f8128dcebc359e0edf90588be7127abae748518e..21a28976ade39a10e0bda92bc100828391e57814 100644 (file)
@@ -71,8 +71,7 @@ SECTION .text
 %endmacro
 
 %macro SAD_END_SSE2 0
-    movdqa  xmm1, xmm0
-    psrldq  xmm0,  8
+    movhlps xmm1, xmm0
     paddw   xmm0, xmm1
     movd    eax,  xmm0
     ret
@@ -154,6 +153,138 @@ cglobal x264_pixel_sad_16x8_sse2
     SAD_INC_4x16P_SSE2
     SAD_END_SSE2
 
+
+; sad x3 / x4
+
+%macro SAD_X3_START_1x16P 0
+    movdqa xmm3, [parm1q]
+    movdqu xmm0, [parm2q]
+    movdqu xmm1, [parm3q]
+    movdqu xmm2, [parm4q]
+    psadbw xmm0, xmm3
+    psadbw xmm1, xmm3
+    psadbw xmm2, xmm3
+%endmacro
+
+%macro SAD_X3_1x16P 2
+    movdqa xmm3, [parm1q+%1]
+    movdqu xmm4, [parm2q+%2]
+    movdqu xmm5, [parm3q+%2]
+    movdqu xmm6, [parm4q+%2]
+    psadbw xmm4, xmm3
+    psadbw xmm5, xmm3
+    psadbw xmm6, xmm3
+    paddw  xmm0, xmm4
+    paddw  xmm1, xmm5
+    paddw  xmm2, xmm6
+%endmacro
+
+%macro SAD_X3_2x16P 1
+%if %1
+    SAD_X3_START_1x16P
+%else
+    SAD_X3_1x16P 0, 0
+%endif
+    SAD_X3_1x16P FENC_STRIDE, parm5q
+    add  parm1q, 2*FENC_STRIDE
+    lea  parm2q, [parm2q+2*parm5q]
+    lea  parm3q, [parm3q+2*parm5q]
+    lea  parm4q, [parm4q+2*parm5q]
+%endmacro
+
+%macro SAD_X4_START_1x16P 0
+    movdqa xmm7, [parm1q]
+    movdqu xmm0, [parm2q]
+    movdqu xmm1, [parm3q]
+    movdqu xmm2, [parm4q]
+    movdqu xmm3, [parm5q]
+    psadbw xmm0, xmm7
+    psadbw xmm1, xmm7
+    psadbw xmm2, xmm7
+    psadbw xmm3, xmm7
+%endmacro
+
+%macro SAD_X4_1x16P 2
+    movdqa xmm7, [parm1q+%1]
+    movdqu xmm4, [parm2q+%2]
+    movdqu xmm5, [parm3q+%2]
+    movdqu xmm6, [parm4q+%2]
+    movdqu xmm8, [parm5q+%2]
+    psadbw xmm4, xmm7
+    psadbw xmm5, xmm7
+    psadbw xmm6, xmm7
+    psadbw xmm8, xmm7
+    paddw  xmm0, xmm4
+    paddw  xmm1, xmm5
+    paddw  xmm2, xmm6
+    paddw  xmm3, xmm8
+%endmacro
+
+%macro SAD_X4_2x16P 1
+%if %1
+    SAD_X4_START_1x16P
+%else
+    SAD_X4_1x16P 0, 0
+%endif
+    SAD_X4_1x16P FENC_STRIDE, parm6q
+    add  parm1q, 2*FENC_STRIDE
+    lea  parm2q, [parm2q+2*parm6q]
+    lea  parm3q, [parm3q+2*parm6q]
+    lea  parm4q, [parm4q+2*parm6q]
+    lea  parm5q, [parm5q+2*parm6q]
+%endmacro
+
+%macro SAD_X3_END 0
+    movhlps xmm4, xmm0
+    movhlps xmm5, xmm1
+    movhlps xmm6, xmm2
+    paddw   xmm0, xmm4
+    paddw   xmm1, xmm5
+    paddw   xmm2, xmm6
+    movd [parm6q+0], xmm0
+    movd [parm6q+4], xmm1
+    movd [parm6q+8], xmm2
+    ret
+%endmacro
+
+%macro SAD_X4_END 0
+    mov      rax, parm7q
+    movhlps xmm4, xmm0
+    movhlps xmm5, xmm1
+    movhlps xmm6, xmm2
+    movhlps xmm7, xmm3
+    paddw   xmm0, xmm4
+    paddw   xmm1, xmm5
+    paddw   xmm2, xmm6
+    paddw   xmm3, xmm7
+    movd [rax+0], xmm0
+    movd [rax+4], xmm1
+    movd [rax+8], xmm2
+    movd [rax+12], xmm3
+    ret
+%endmacro
+
+;-----------------------------------------------------------------------------
+;  void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
+;                                     uint8_t *pix2, int i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
+%macro SAD_X 3
+cglobal x264_pixel_sad_x%1_%2x%3_sse2
+    SAD_X%1_2x%2P 1
+%rep %3/2-1
+    SAD_X%1_2x%2P 0
+%endrep
+    SAD_X%1_END
+%endmacro
+
+SAD_X 3, 16, 16
+SAD_X 3, 16,  8
+SAD_X 4, 16, 16
+SAD_X 4, 16,  8
+
+
+; ssd
+
 %macro SSD_INC_2x16P_SSE2 0
     movdqu  xmm1,   [rdi]
     movdqu  xmm2,   [rdx]
index 39fb210eef3ef73d2aec319ee308594ad9317c9d..26dca08d410f45c65355be9356226a83262c8273 100644 (file)
@@ -514,13 +514,10 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->satd[PIXEL_8x8]  = x264_pixel_satd_8x8_sse2;
         pixf->satd[PIXEL_8x4]  = x264_pixel_satd_8x4_sse2;
 
-#ifdef ARCH_X86
         pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_sse2;
         pixf->sad_x3[PIXEL_16x8 ] = x264_pixel_sad_x3_16x8_sse2;
-
         pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_sse2;
         pixf->sad_x4[PIXEL_16x8 ] = x264_pixel_sad_x4_16x8_sse2;
-#endif
     }
     // these are faster on both Intel and AMD
     if( cpu&X264_CPU_SSE2 )