From 3de28cd5878a8be64e1db80ff2453e79acb0040d Mon Sep 17 00:00:00 2001 From: Loren Merritt Date: Tue, 9 May 2006 06:11:42 +0000 Subject: [PATCH] faster intra search: some prediction modes don't have to compute a full hadamard transform. x86 and amd64 asm. git-svn-id: svn://svn.videolan.org/x264/trunk@519 df754926-b1dd-0310-bc7b-ec298dee348c --- common/amd64/pixel-a.asm | 370 ++++++++++++++++++++ common/amd64/pixel-sse2.asm | 215 +++++++++++- common/i386/pixel-a.asm | 649 +++++++++++++++++++++++++++++++++-- common/i386/pixel.h | 7 + common/i386/predict-c.c | 54 ++- common/pixel.c | 5 + common/pixel.h | 7 + encoder/analyse.c | 126 +++++-- encoder/slicetype_decision.c | 24 +- tools/checkasm.c | 40 +++ 10 files changed, 1426 insertions(+), 71 deletions(-) diff --git a/common/amd64/pixel-a.asm b/common/amd64/pixel-a.asm index be067d89..f954a325 100644 --- a/common/amd64/pixel-a.asm +++ b/common/amd64/pixel-a.asm @@ -359,6 +359,12 @@ BITS 64 SBUTTERFLYdq %5, %2, %3 %endmacro +%macro MMX_ABS 2 ; mma, tmp + pxor %2, %2 + psubw %2, %1 + pmaxsw %1, %2 +%endmacro + %macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1 pxor %3, %3 pxor %4, %4 @@ -446,6 +452,11 @@ cglobal x264_pixel_satd_16x8_mmxext cglobal x264_pixel_satd_8x16_mmxext cglobal x264_pixel_satd_16x16_mmxext +cglobal x264_intra_satd_x3_4x4_mmxext +cglobal x264_intra_satd_x3_8x8c_mmxext +cglobal x264_intra_satd_x3_16x16_mmxext + + %macro SAD_START 0 pxor mm0, mm0 %endmacro @@ -740,3 +751,362 @@ x264_pixel_satd_16x16_mmxext: movd eax, mm0 ret + +; in: parm1 = fenc +; out: mm0..mm3 = hadamard coefs +ALIGN 16 +load_hadamard: + pxor mm7, mm7 + movd mm0, [parm1q+0*FENC_STRIDE] + movd mm4, [parm1q+1*FENC_STRIDE] + movd mm3, [parm1q+2*FENC_STRIDE] + movd mm1, [parm1q+3*FENC_STRIDE] + punpcklbw mm0, mm7 + punpcklbw mm4, mm7 + punpcklbw mm3, mm7 + punpcklbw mm1, mm7 + HADAMARD4x4 mm0, mm4, mm3, mm1 + TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2 + HADAMARD4x4 mm0, mm1, mm2, mm3 + ret + +%macro SCALAR_SUMSUB 4 + add %1, %2 + add %3, %4 + add %2, %2 + add %4, %4 + sub %2, %1 + sub %4, %3 +%endmacro + +%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op + pxor %7, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + paddw %1, %4 + paddw %2, %5 + paddw %3, %6 + punpcklwd %1, %7 + punpcklwd %2, %7 + punpcklwd %3, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + %8 %1, %4 + %8 %2, %5 + %8 %3, %6 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_4x4_mmxext: +%define top_1d rsp-8 ; +8 +%define left_1d rsp-16 ; +8 + call load_hadamard + + movzx r8d, byte [parm2q-1+0*FDEC_STRIDE] + movzx r9d, byte [parm2q-1+1*FDEC_STRIDE] + movzx r10d, byte [parm2q-1+2*FDEC_STRIDE] + movzx r11d, byte [parm2q-1+3*FDEC_STRIDE] + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 1x4 hadamard + mov [left_1d+0], r8w + mov [left_1d+2], r9w + mov [left_1d+4], r10w + mov [left_1d+6], r11w + mov eax, r8d ; dc + + movzx r8d, byte [parm2q-FDEC_STRIDE+0] + movzx r9d, byte [parm2q-FDEC_STRIDE+1] + movzx r10d, byte [parm2q-FDEC_STRIDE+2] + movzx r11d, byte [parm2q-FDEC_STRIDE+3] + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 4x1 hadamard + lea rax, [rax + r8 + 4] ; dc + mov [top_1d+0], r8w + mov [top_1d+2], r9w + mov [top_1d+4], r10w + mov [top_1d+6], r11w + and eax, -8 
+ shl eax, 1 + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d] + movd mm5, eax + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d] + psllw mm1, 2 + psubw mm0, mm1 + MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + paddw mm4, mm7 + paddw mm5, mm7 + movq mm1, mm5 + psrlq mm1, 16 ; 4x3 sum + paddw mm0, mm1 + + SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw + movd [parm3q+0], mm0 ; i4x4_v satd + movd [parm3q+4], mm4 ; i4x4_h satd + movd [parm3q+8], mm5 ; i4x4_dc satd + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_16x16_mmxext: +%define sums rsp-32 ; +24 +%define top_1d rsp-64 ; +32 +%define left_1d rsp-96 ; +32 + + mov qword [sums+0], 0 + mov qword [sums+8], 0 + mov qword [sums+16], 0 + + ; 1D hadamards + xor ecx, ecx + mov eax, 12 +.loop_edge: + ; left + shl eax, 5 ; log(FDEC_STRIDE) + movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE] + movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE] + movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE] + movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE] + shr eax, 5 + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d + add ecx, r8d + mov [left_1d+2*rax+0], r8w + mov [left_1d+2*rax+2], r9w + mov [left_1d+2*rax+4], r10w + mov [left_1d+2*rax+6], r11w + + ; top + movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0] + movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1] + movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2] + movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3] + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d + add ecx, r8d + mov [top_1d+2*rax+0], r8w + mov [top_1d+2*rax+2], r9w + mov [top_1d+2*rax+4], r10w + mov [top_1d+2*rax+6], r11w + sub eax, 4 + jge .loop_edge + + ; dc + shr ecx, 1 + add ecx, 8 + and ecx, -16 + + ; 2D hadamards + xor eax, eax +.loop_y: + xor esi, esi +.loop_x: + call load_hadamard + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d+8*rax] + movd mm5, ecx + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d+8*rsi] + psllw mm1, 2 + psubw mm0, mm1 + MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+0] ; i4x4_v satd + paddw mm4, [sums+8] ; i4x4_h satd + paddw mm5, [sums+16] ; i4x4_dc satd + movq [sums+0], mm0 + movq [sums+8], mm4 + movq [sums+16], mm5 + + add parm1q, 4 + inc esi + cmp esi, 4 + jl .loop_x + add parm1q, 4*FENC_STRIDE-16 + inc eax + cmp eax, 4 + jl .loop_y + +; horizontal sum + movq mm2, [sums+16] + movq mm1, [sums+8] + movq mm0, [sums+0] + movq mm7, mm2 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + psrld mm0, 1 + pslld mm7, 16 + psrld mm7, 16 + paddd mm0, mm2 + psubd mm0, mm7 + movd [parm3q+8], mm2 ; i16x16_dc satd + movd [parm3q+4], mm1 ; i16x16_h satd + movd [parm3q+0], mm0 ; i16x16_v satd + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, 
uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_8x8c_mmxext: +%define sums rsp-32 ; +24 +%define top_1d rsp-48 ; +16 +%define left_1d rsp-64 ; +16 + + mov qword [sums+0], 0 + mov qword [sums+8], 0 + mov qword [sums+16], 0 + + ; 1D hadamards + mov eax, 4 +.loop_edge: + ; left + shl eax, 5 ; log(FDEC_STRIDE) + movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE] + movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE] + movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE] + movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE] + shr eax, 5 + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d + mov [left_1d+2*rax+0], r8w + mov [left_1d+2*rax+2], r9w + mov [left_1d+2*rax+4], r10w + mov [left_1d+2*rax+6], r11w + + ; top + movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0] + movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1] + movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2] + movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3] + SCALAR_SUMSUB r8d, r9d, r10d, r11d + SCALAR_SUMSUB r8d, r10d, r9d, r11d + mov [top_1d+2*rax+0], r8w + mov [top_1d+2*rax+2], r9w + mov [top_1d+2*rax+4], r10w + mov [top_1d+2*rax+6], r11w + sub eax, 4 + jge .loop_edge + + ; dc + movzx r8d, word [left_1d+0] + movzx r9d, word [top_1d+0] + movzx r10d, word [left_1d+8] + movzx r11d, word [top_1d+8] + add r8d, r9d + lea r9, [r10 + r11] + lea r8, [2*r8 + 8] + lea r9, [2*r9 + 8] + lea r10, [4*r10 + 8] + lea r11, [4*r11 + 8] + and r8d, -16 ; tl + and r9d, -16 ; br + and r10d, -16 ; bl + and r11d, -16 ; tr + shl r9, 16 + mov r9w, r10w + shl r9, 16 + mov r9w, r11w + shl r9, 16 + mov r9w, r8w + + ; 2D hadamards + xor eax, eax +.loop_y: + xor esi, esi +.loop_x: + call load_hadamard + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d+8*rax] + movzx ecx, r9w + shr r9, 16 + movd mm5, ecx + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d+8*rsi] + psllw mm1, 2 + psubw mm0, mm1 + MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+16] ; i4x4_v satd + paddw mm4, [sums+8] ; i4x4_h satd + paddw mm5, [sums+0] ; i4x4_dc satd + movq [sums+16], mm0 + movq [sums+8], mm4 + movq [sums+0], mm5 + + add parm1q, 4 + inc esi + cmp esi, 2 + jl .loop_x + add parm1q, 4*FENC_STRIDE-8 + inc eax + cmp eax, 2 + jl .loop_y + +; horizontal sum + movq mm0, [sums+0] + movq mm1, [sums+8] + movq mm2, [sums+16] + movq mm7, mm0 + psrlq mm7, 15 + paddw mm2, mm7 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + psrld mm2, 1 + movd [parm3q+0], mm0 ; i8x8c_dc satd + movd [parm3q+4], mm1 ; i8x8c_h satd + movd [parm3q+8], mm2 ; i8x8c_v satd + ret diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm index f616ecd2..80b7e8fc 100644 --- a/common/amd64/pixel-sse2.asm +++ b/common/amd64/pixel-sse2.asm @@ -31,6 +31,7 @@ BITS 64 SECTION .rodata align=16 pd_0000ffff: times 4 dd 0x0000ffff +pb_1: times 16 db 1 SECTION .text @@ -47,6 +48,7 @@ cglobal x264_pixel_satd_8x16_sse2 cglobal x264_pixel_satd_16x16_sse2 cglobal x264_pixel_sa8d_8x8_sse2 cglobal x264_pixel_sa8d_16x16_sse2 +cglobal x264_intra_sa8d_x3_8x8_sse2 %macro SAD_INC_4x16P_SSE2 0 movdqu xmm1, [rdx] @@ -357,6 +359,13 @@ x264_pixel_ssd_16x8_sse2: punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3 %endmacro +%macro SUM1x8_SSE2 3 ; 01 junk sum + pxor %2, %2 + psubw %2, %1 + pmaxsw 
%1, %2 + paddusw %3, %1 +%endmacro + %macro SUM4x4_SSE2 4 ; 02 13 junk sum pxor %3, %3 psubw %3, %1 @@ -391,8 +400,6 @@ x264_pixel_ssd_16x8_sse2: %endmacro %macro SUM_MM_SSE2 2 ; sum junk - ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first. - psrlw %1, 1 movdqa %2, %1 psrldq %1, 2 paddusw %1, %2 @@ -423,6 +430,7 @@ x264_pixel_ssd_16x8_sse2: %endmacro %macro SATD_END 0 + psrlw xmm6, 1 SUM_MM_SSE2 xmm6, xmm7 ret %endmacro @@ -584,6 +592,7 @@ x264_pixel_sa8d_8x8_sse2: pxor xmm10, xmm10 SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10 SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10 + psrlw xmm10, 1 SUM_MM_SSE2 xmm10, xmm0 add r8d, eax ; preserve rounding for 16x16 add eax, 1 @@ -615,3 +624,205 @@ x264_pixel_sa8d_16x16_sse2: add eax, 1 shr eax, 1 ret + + + +%macro LOAD_HADAMARD8 1 + pxor xmm4, xmm4 + movq xmm0, [%1+0*FENC_STRIDE] + movq xmm7, [%1+1*FENC_STRIDE] + movq xmm6, [%1+2*FENC_STRIDE] + movq xmm3, [%1+3*FENC_STRIDE] + movq xmm5, [%1+4*FENC_STRIDE] + movq xmm1, [%1+5*FENC_STRIDE] + movq xmm8, [%1+6*FENC_STRIDE] + movq xmm2, [%1+7*FENC_STRIDE] + punpcklbw xmm0, xmm4 + punpcklbw xmm7, xmm4 + punpcklbw xmm6, xmm4 + punpcklbw xmm3, xmm4 + punpcklbw xmm5, xmm4 + punpcklbw xmm1, xmm4 + punpcklbw xmm8, xmm4 + punpcklbw xmm2, xmm4 + HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2 + TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4 + HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 +%endmacro + +%macro SCALAR_SUMSUB 4 + add %1, %2 + add %3, %4 + add %2, %2 + add %4, %4 + sub %2, %1 + sub %4, %3 +%endmacro + +%macro SCALAR_HADAMARD1x8 9 ; 8x tmp, dst + SCALAR_SUMSUB %1, %5, %2, %6 + SCALAR_SUMSUB %3, %7, %4, %8 + SCALAR_SUMSUB %1, %3, %2, %4 + SCALAR_SUMSUB %5, %7, %6, %8 + SCALAR_SUMSUB %1, %2, %3, %4 + SCALAR_SUMSUB %5, %6, %7, %8 + mov [%9+0], %1 + mov [%9+2], %2 + mov [%9+4], %3 + mov [%9+6], %4 + mov [%9+8], %5 + mov [%9+10], %6 + mov [%9+12], %7 + mov [%9+14], %8 +%endmacro + +; dest, left, right, src, tmp +; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 +%macro PRED8x8_LOWPASS 5 + movq %5, %2 + pavgb %2, %3 + pxor %3, %5 + movq %1, %4 + pand %3, [pb_1 GLOBAL] + psubusb %2, %3 + pavgb %1, %2 +%endmacro + +; output: mm0 = filtered t0..t7 +; assumes topleft is available +%macro PRED8x8_LOAD_TOP_FILT 1 + movq mm1, [%1-1] + movq mm2, [%1+1] + and parm4d, byte 4 + jne .have_topright + mov al, [%1+7] + mov ah, al + pinsrw mm2, eax, 3 +.have_topright: + PRED8x8_LOWPASS mm0, mm1, mm2, [%1], mm7 +%endmacro + +%macro PRED8x8_LOAD_LEFT_FILT 10 ; 8x reg, tmp, src + movzx %1, byte [%10-1*FDEC_STRIDE] + movzx %2, byte [%10+0*FDEC_STRIDE] + movzx %3, byte [%10+1*FDEC_STRIDE] + movzx %4, byte [%10+2*FDEC_STRIDE] + movzx %5, byte [%10+3*FDEC_STRIDE] + movzx %6, byte [%10+4*FDEC_STRIDE] + movzx %7, byte [%10+5*FDEC_STRIDE] + movzx %8, byte [%10+6*FDEC_STRIDE] + movzx %9, byte [%10+7*FDEC_STRIDE] + lea %1, [%1+%2+1] + lea %2, [%2+%3+1] + lea %3, [%3+%4+1] + lea %4, [%4+%5+1] + lea %5, [%5+%6+1] + lea %6, [%6+%7+1] + lea %7, [%7+%8+1] + lea %8, [%8+%9+1] + lea %9, [%9+%9+1] + add %1, %2 + add %2, %3 + add %3, %4 + add %4, %5 + add %5, %6 + add %6, %7 + add %7, %8 + add %8, %9 + shr %1, 2 + shr %2, 2 + shr %3, 2 + shr %4, 2 + shr %5, 2 + shr %6, 2 + shr %7, 2 + shr %8, 2 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t *fdec, +; int *res, int i_neighbors ) 
+;----------------------------------------------------------------------------- +x264_intra_sa8d_x3_8x8_sse2: +%define left_1d rsp-16 ; +16 +%define top_1d rsp-32 ; +16 + push rbx + push r12 + push r13 + push r14 + push r15 + LOAD_HADAMARD8 parm1q + + PRED8x8_LOAD_LEFT_FILT r8, r9, r10, r11, r12, r13, r14, r15, rax, parm2q-1 + SCALAR_HADAMARD1x8 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d, left_1d + mov edi, r8d ; dc + + PRED8x8_LOAD_TOP_FILT parm2q-FDEC_STRIDE + movq [top_1d], mm0 + movzx r8d, byte [top_1d+0] + movzx r9d, byte [top_1d+1] + movzx r10d, byte [top_1d+2] + movzx r11d, byte [top_1d+3] + movzx r12d, byte [top_1d+4] + movzx r13d, byte [top_1d+5] + movzx r14d, byte [top_1d+6] + movzx r15d, byte [top_1d+7] + SCALAR_HADAMARD1x8 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w, top_1d + lea rdi, [rdi + r8 + 8] ; dc + and edi, -16 + shl edi, 2 + + pxor xmm15, xmm15 + movdqa xmm8, xmm2 + movdqa xmm9, xmm3 + movdqa xmm10, xmm4 + movdqa xmm11, xmm5 + SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15 + movdqa xmm8, xmm6 + movdqa xmm9, xmm7 + SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15 + movdqa xmm8, xmm1 + SUM1x8_SSE2 xmm8, xmm10, xmm15 + movdqa xmm14, xmm15 ; 7x8 sum + + movdqa xmm8, [left_1d] ; left edge + movd xmm9, edi + psllw xmm8, 3 + psubw xmm8, xmm0 + psubw xmm9, xmm0 + SUM1x8_SSE2 xmm8, xmm10, xmm14 + SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + punpcklwd xmm4, xmm5 + punpcklwd xmm6, xmm7 + punpckldq xmm0, xmm2 + punpckldq xmm4, xmm6 + punpcklqdq xmm0, xmm4 ; transpose + movdqa xmm1, [top_1d] + movdqa xmm2, xmm15 + psllw xmm1, 3 + psrldq xmm2, 2 ; 8x7 sum + psubw xmm0, xmm1 ; 8x1 sum + SUM1x8_SSE2 xmm0, xmm1, xmm2 + + SUM_MM_SSE2 xmm14, xmm3 + add eax, 2 + shr eax, 2 + mov [parm3q+4], eax ; i8x8_h sa8d + SUM_MM_SSE2 xmm15, xmm4 + add eax, 2 + shr eax, 2 + mov [parm3q+8], eax ; i8x8_dc sa8d + SUM_MM_SSE2 xmm2, xmm5 + add eax, 2 + shr eax, 2 + mov [parm3q+0], eax ; i8x8_v sa8d + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret diff --git a/common/i386/pixel-a.asm b/common/i386/pixel-a.asm index e7278fd6..331b1848 100644 --- a/common/i386/pixel-a.asm +++ b/common/i386/pixel-a.asm @@ -485,6 +485,12 @@ cglobal x264_pixel_satd_16x16_mmxext cglobal x264_pixel_sa8d_16x16_mmxext cglobal x264_pixel_sa8d_8x8_mmxext +cglobal x264_intra_satd_x3_4x4_mmxext +cglobal x264_intra_satd_x3_8x8c_mmxext +cglobal x264_intra_satd_x3_16x16_mmxext +cglobal x264_intra_sa8d_x3_8x8_core_mmxext + + %macro SAD_START 0 push ebx @@ -872,22 +878,23 @@ x264_pixel_sa8d_8x8_mmxext: SATD_START sub esp, 0x70 %define args esp+0x74 -%define spill esp+0x60 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 LOAD_DIFF_4x8P 0 HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 movq [spill], mm0 TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq [esp+0x00], mm4 - movq [esp+0x08], mm7 - movq [esp+0x10], mm0 - movq [esp+0x18], mm6 + movq [trans+0x00], mm4 + movq [trans+0x08], mm7 + movq [trans+0x10], mm0 + movq [trans+0x18], mm6 movq mm0, [spill] TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 - movq [esp+0x20], mm0 - movq [esp+0x28], mm3 - movq [esp+0x30], mm4 - movq [esp+0x38], mm2 + movq [trans+0x20], mm0 + movq [trans+0x28], mm3 + movq [trans+0x30], mm4 + movq [trans+0x38], mm2 mov eax, [args+4] mov ecx, [args+12] @@ -896,29 +903,29 @@ x264_pixel_sa8d_8x8_mmxext: movq [spill], mm7 TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7 - movq [esp+0x40], mm0 - movq [esp+0x48], mm3 - movq [esp+0x50], mm7 - movq [esp+0x58], mm2 + movq [trans+0x40], mm0 + movq [trans+0x48], mm3 + movq [trans+0x50], mm7 + 
movq [trans+0x58], mm2 movq mm7, [spill] TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 - movq mm5, [esp+0x00] - movq mm1, [esp+0x08] - movq mm2, [esp+0x10] - movq mm3, [esp+0x18] + movq mm5, [trans+0x00] + movq mm1, [trans+0x08] + movq mm2, [trans+0x10] + movq mm3, [trans+0x18] HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 SUM4x8_MM - movq [esp], mm0 - - movq mm0, [esp+0x20] - movq mm1, [esp+0x28] - movq mm2, [esp+0x30] - movq mm3, [esp+0x38] - movq mm4, [esp+0x40] - movq mm5, [esp+0x48] - movq mm6, [esp+0x50] - movq mm7, [esp+0x58] + movq [trans], mm0 + + movq mm0, [trans+0x20] + movq mm1, [trans+0x28] + movq mm2, [trans+0x30] + movq mm3, [trans+0x38] + movq mm4, [trans+0x40] + movq mm5, [trans+0x48] + movq mm6, [trans+0x50] + movq mm7, [trans+0x58] HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 SUM4x8_MM @@ -938,6 +945,7 @@ x264_pixel_sa8d_8x8_mmxext: ret %undef args %undef spill +%undef trans ALIGN 16 ;----------------------------------------------------------------------------- @@ -976,3 +984,590 @@ x264_pixel_sa8d_16x16_mmxext: pop edi pop esi ret + + +; in: fenc +; out: mm0..mm3 = hadamard coefs +%macro LOAD_HADAMARD 1 + pxor mm7, mm7 + movd mm0, [%1+0*FENC_STRIDE] + movd mm4, [%1+1*FENC_STRIDE] + movd mm3, [%1+2*FENC_STRIDE] + movd mm1, [%1+3*FENC_STRIDE] + punpcklbw mm0, mm7 + punpcklbw mm4, mm7 + punpcklbw mm3, mm7 + punpcklbw mm1, mm7 + HADAMARD4x4 mm0, mm4, mm3, mm1 + TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2 + HADAMARD4x4 mm0, mm1, mm2, mm3 +%endmacro + +%macro SCALAR_SUMSUB 4 + add %1, %2 + add %3, %4 + add %2, %2 + add %4, %4 + sub %2, %1 + sub %4, %3 +%endmacro + +%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op + pxor %7, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + paddusw %1, %4 + paddusw %2, %5 + paddusw %3, %6 + punpcklwd %1, %7 + punpcklwd %2, %7 + punpcklwd %3, %7 + pshufw %4, %1, 01001110b + pshufw %5, %2, 01001110b + pshufw %6, %3, 01001110b + %8 %1, %4 + %8 %2, %5 + %8 %3, %6 +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_4x4_mmxext: + push ebx + push edi + push esi + sub esp, 16 +%define args esp+32 +%define top_1d esp+8 ; +8 +%define left_1d esp+0 ; +8 + + mov eax, [args+0] ; fenc + LOAD_HADAMARD eax + + mov edi, [args+4] ; fdec + movzx eax, byte [edi-1+0*FDEC_STRIDE] + movzx ebx, byte [edi-1+1*FDEC_STRIDE] + movzx ecx, byte [edi-1+2*FDEC_STRIDE] + movzx edx, byte [edi-1+3*FDEC_STRIDE] + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx ; 1x4 hadamard + mov [left_1d+0], ax + mov [left_1d+2], bx + mov [left_1d+4], cx + mov [left_1d+6], dx + mov esi, eax ; dc + + movzx eax, byte [edi-FDEC_STRIDE+0] + movzx ebx, byte [edi-FDEC_STRIDE+1] + movzx ecx, byte [edi-FDEC_STRIDE+2] + movzx edx, byte [edi-FDEC_STRIDE+3] + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx ; 4x1 hadamard + mov [top_1d+0], ax + mov [top_1d+2], bx + mov [top_1d+4], cx + mov [top_1d+6], dx + lea esi, [esi + eax + 4] ; dc + and esi, -8 + shl esi, 1 + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d] + movd mm5, esi + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d] + psllw mm1, 2 + psubw mm0, mm1 
+ MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + paddw mm4, mm7 + paddw mm5, mm7 + movq mm1, mm5 + psrlq mm1, 16 ; 4x3 sum + paddw mm0, mm1 + + SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw + mov eax, [args+8] ; res + movd [eax+0], mm0 ; i4x4_v satd + movd [eax+4], mm4 ; i4x4_h satd + movd [eax+8], mm5 ; i4x4_dc satd + + add esp, 16 + pop esi + pop edi + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_16x16_mmxext: + push ebx + push ebp + push edi + push esi + sub esp, 88 +%define args esp+108 +%define sums esp+64 ; +24 +%define top_1d esp+32 ; +32 +%define left_1d esp+0 ; +32 + + pxor mm0, mm0 + movq [sums+0], mm0 + movq [sums+8], mm0 + movq [sums+16], mm0 + + ; 1D hadamards + mov edi, [args+4] ; fdec + xor ebp, ebp + mov esi, 12 +.loop_edge: + ; left + shl esi, 5 ; log(FDEC_STRIDE) + movzx eax, byte [edi+esi-1+0*FDEC_STRIDE] + movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE] + movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE] + movzx edx, byte [edi+esi-1+3*FDEC_STRIDE] + shr esi, 5 + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx + add ebp, eax + mov [left_1d+2*esi+0], ax + mov [left_1d+2*esi+2], bx + mov [left_1d+2*esi+4], cx + mov [left_1d+2*esi+6], dx + + ; top + movzx eax, byte [edi+esi-FDEC_STRIDE+0] + movzx ebx, byte [edi+esi-FDEC_STRIDE+1] + movzx ecx, byte [edi+esi-FDEC_STRIDE+2] + movzx edx, byte [edi+esi-FDEC_STRIDE+3] + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx + add ebp, eax + mov [top_1d+2*esi+0], ax + mov [top_1d+2*esi+2], bx + mov [top_1d+2*esi+4], cx + mov [top_1d+2*esi+6], dx + sub esi, 4 + jge .loop_edge + + ; dc + shr ebp, 1 + add ebp, 8 + and ebp, -16 + + ; 2D hadamards + mov eax, [args+0] ; fenc + xor edi, edi +.loop_y: + xor esi, esi +.loop_x: + LOAD_HADAMARD eax + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d+8*edi] + movd mm5, ebp + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d+8*esi] + psllw mm1, 2 + psubw mm0, mm1 + MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+0] ; i4x4_v satd + paddw mm4, [sums+8] ; i4x4_h satd + paddw mm5, [sums+16] ; i4x4_dc satd + movq [sums+0], mm0 + movq [sums+8], mm4 + movq [sums+16], mm5 + + add eax, 4 + inc esi + cmp esi, 4 + jl .loop_x + add eax, 4*FENC_STRIDE-16 + inc edi + cmp edi, 4 + jl .loop_y + +; horizontal sum + movq mm2, [sums+16] + movq mm0, [sums+0] + movq mm1, [sums+8] + movq mm7, mm2 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + psrld mm0, 1 + pslld mm7, 16 + psrld mm7, 16 + paddd mm0, mm2 + psubd mm0, mm7 + mov eax, [args+8] ; res + movd [eax+0], mm0 ; i16x16_v satd + movd [eax+4], mm1 ; i16x16_h satd + movd [eax+8], mm2 ; i16x16_dc satd + + add esp, 88 + pop esi + pop edi + pop ebp + pop ebx + ret + +ALIGN 16 +;----------------------------------------------------------------------------- +; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res ) +;----------------------------------------------------------------------------- +x264_intra_satd_x3_8x8c_mmxext: + push ebx + push 
ebp + push edi + push esi + sub esp, 72 +%define args esp+92 +%define sums esp+48 ; +24 +%define dc_1d esp+32 ; +16 +%define top_1d esp+16 ; +16 +%define left_1d esp+0 ; +16 + + pxor mm0, mm0 + movq [sums+0], mm0 + movq [sums+8], mm0 + movq [sums+16], mm0 + + ; 1D hadamards + mov edi, [args+4] ; fdec + xor ebp, ebp + mov esi, 12 +.loop_edge: + ; left + shl esi, 5 ; log(FDEC_STRIDE) + movzx eax, byte [edi+esi-1+0*FDEC_STRIDE] + movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE] + movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE] + movzx edx, byte [edi+esi-1+3*FDEC_STRIDE] + shr esi, 5 + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx + mov [left_1d+2*esi+0], ax + mov [left_1d+2*esi+2], bx + mov [left_1d+2*esi+4], cx + mov [left_1d+2*esi+6], dx + + ; top + movzx eax, byte [edi+esi-FDEC_STRIDE+0] + movzx ebx, byte [edi+esi-FDEC_STRIDE+1] + movzx ecx, byte [edi+esi-FDEC_STRIDE+2] + movzx edx, byte [edi+esi-FDEC_STRIDE+3] + SCALAR_SUMSUB eax, ebx, ecx, edx + SCALAR_SUMSUB eax, ecx, ebx, edx + mov [top_1d+2*esi+0], ax + mov [top_1d+2*esi+2], bx + mov [top_1d+2*esi+4], cx + mov [top_1d+2*esi+6], dx + sub esi, 4 + jge .loop_edge + + ; dc + movzx eax, word [left_1d+0] + movzx ebx, word [top_1d+0] + movzx ecx, word [left_1d+8] + movzx edx, word [top_1d+8] + add eax, ebx + lea ebx, [ecx + edx] + lea eax, [2*eax + 8] + lea ebx, [2*ebx + 8] + lea ecx, [4*ecx + 8] + lea edx, [4*edx + 8] + and eax, -16 + and ebx, -16 + and ecx, -16 + and edx, -16 + mov [dc_1d+ 0], eax ; tl + mov [dc_1d+ 4], edx ; tr + mov [dc_1d+ 8], ecx ; bl + mov [dc_1d+12], ebx ; br + lea ebp, [dc_1d] + + ; 2D hadamards + mov eax, [args+0] ; fenc + xor edi, edi +.loop_y: + xor esi, esi +.loop_x: + LOAD_HADAMARD eax + + movq mm4, mm1 + movq mm5, mm2 + MMX_ABS_TWO mm4, mm5, mm6, mm7 + movq mm7, mm3 + paddw mm4, mm5 + MMX_ABS mm7, mm6 + paddw mm7, mm4 ; 3x4 sum + + movq mm4, [left_1d+8*edi] + movd mm5, [ebp] + psllw mm4, 2 + psubw mm4, mm0 + psubw mm5, mm0 + punpcklwd mm0, mm1 + punpcklwd mm2, mm3 + punpckldq mm0, mm2 ; transpose + movq mm1, [top_1d+8*esi] + psllw mm1, 2 + psubw mm0, mm1 + MMX_ABS mm4, mm3 ; 1x4 sum + MMX_ABS mm5, mm2 ; 1x4 sum + MMX_ABS mm0, mm1 ; 4x1 sum + pavgw mm4, mm7 + pavgw mm5, mm7 + paddw mm0, [sums+16] ; i4x4_v satd + paddw mm4, [sums+8] ; i4x4_h satd + paddw mm5, [sums+0] ; i4x4_dc satd + movq [sums+16], mm0 + movq [sums+8], mm4 + movq [sums+0], mm5 + + add eax, 4 + add ebp, 4 + inc esi + cmp esi, 2 + jl .loop_x + add eax, 4*FENC_STRIDE-8 + inc edi + cmp edi, 2 + jl .loop_y + +; horizontal sum + movq mm0, [sums+0] + movq mm1, [sums+8] + movq mm2, [sums+16] + movq mm6, mm0 + psrlq mm6, 15 + paddw mm2, mm6 + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm7, paddd + psrld mm2, 1 + mov eax, [args+8] ; res + movd [eax+0], mm0 ; i8x8c_dc satd + movd [eax+4], mm1 ; i8x8c_h satd + movd [eax+8], mm2 ; i8x8c_v satd + + add esp, 72 + pop esi + pop edi + pop ebp + pop ebx + ret + +%macro LOAD_4x8P 1 ; dx + pxor mm7, mm7 + movd mm6, [eax+%1+7*FENC_STRIDE] + movd mm0, [eax+%1+0*FENC_STRIDE] + movd mm1, [eax+%1+1*FENC_STRIDE] + movd mm2, [eax+%1+2*FENC_STRIDE] + movd mm3, [eax+%1+3*FENC_STRIDE] + movd mm4, [eax+%1+4*FENC_STRIDE] + movd mm5, [eax+%1+5*FENC_STRIDE] + punpcklbw mm6, mm7 + punpcklbw mm0, mm7 + punpcklbw mm1, mm7 + movq [spill], mm6 + punpcklbw mm2, mm7 + punpcklbw mm3, mm7 + movd mm6, [eax+%1+6*FENC_STRIDE] + punpcklbw mm4, mm7 + punpcklbw mm5, mm7 + punpcklbw mm6, mm7 + movq mm7, [spill] +%endmacro + +ALIGN 16 +;----------------------------------------------------------------------------- +; void 
x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res ) +;----------------------------------------------------------------------------- +x264_intra_sa8d_x3_8x8_core_mmxext: + mov eax, [esp+4] + mov ecx, [esp+8] + sub esp, 0x70 +%define args esp+0x74 +%define spill esp+0x60 ; +16 +%define trans esp+0 ; +96 +%define sum esp+0 ; +32 + LOAD_4x8P 0 + HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movq [spill], mm0 + TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 + movq [trans+0x00], mm4 + movq [trans+0x08], mm7 + movq [trans+0x10], mm0 + movq [trans+0x18], mm6 + movq mm0, [spill] + TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4 + movq [trans+0x20], mm0 + movq [trans+0x28], mm3 + movq [trans+0x30], mm4 + movq [trans+0x38], mm2 + + LOAD_4x8P 4 + HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movq [spill], mm7 + TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7 + movq [trans+0x40], mm0 + movq [trans+0x48], mm3 + movq [trans+0x50], mm7 + movq [trans+0x58], mm2 + movq mm7, [spill] + TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 + movq mm5, [trans+0x00] + movq mm1, [trans+0x08] + movq mm2, [trans+0x10] + movq mm3, [trans+0x18] + + HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6 + + movq [spill+0], mm5 + movq [spill+8], mm7 + MMX_ABS_TWO mm0, mm1, mm5, mm7 + MMX_ABS_TWO mm2, mm3, mm5, mm7 + paddw mm0, mm2 + paddw mm1, mm3 + paddw mm0, mm1 + MMX_ABS_TWO mm4, mm6, mm2, mm3 + movq mm5, [spill+0] + movq mm7, [spill+8] + paddw mm0, mm4 + paddw mm0, mm6 + MMX_ABS mm7, mm1 + paddw mm0, mm7 ; 7x4 sum + movq mm6, mm5 + movq mm7, [ecx+8] ; left bottom + psllw mm7, 3 + psubw mm6, mm7 + MMX_ABS_TWO mm5, mm6, mm2, mm3 + paddw mm5, mm0 + paddw mm6, mm0 + movq [sum+0], mm5 ; dc + movq [sum+8], mm6 ; left + + movq mm0, [trans+0x20] + movq mm1, [trans+0x28] + movq mm2, [trans+0x30] + movq mm3, [trans+0x38] + movq mm4, [trans+0x40] + movq mm5, [trans+0x48] + movq mm6, [trans+0x50] + movq mm7, [trans+0x58] + + HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7 + + movd [sum+0x10], mm0 + movd [sum+0x12], mm1 + movd [sum+0x14], mm2 + movd [sum+0x16], mm3 + movd [sum+0x18], mm4 + movd [sum+0x1a], mm5 + movd [sum+0x1c], mm6 + movd [sum+0x1e], mm7 + + movq [spill], mm0 + movq [spill+8], mm1 + MMX_ABS_TWO mm2, mm3, mm0, mm1 + MMX_ABS_TWO mm4, mm5, mm0, mm1 + paddw mm2, mm3 + paddw mm4, mm5 + paddw mm2, mm4 + movq mm0, [spill] + movq mm1, [spill+8] + MMX_ABS_TWO mm6, mm7, mm4, mm5 + MMX_ABS mm1, mm4 + paddw mm2, mm7 + paddw mm1, mm6 + paddw mm2, mm1 ; 7x4 sum + movq mm1, mm0 + + movq mm7, [ecx+0] + psllw mm7, 3 ; left top + + movzx edx, word [ecx+0] + add dx, [ecx+16] + lea edx, [4*edx+32] + and edx, -64 + movd mm6, edx ; dc + + psubw mm1, mm7 + psubw mm0, mm6 + MMX_ABS_TWO mm0, mm1, mm5, mm6 + movq mm3, [sum+0] ; dc + paddw mm0, mm2 + paddw mm1, mm2 + movq mm2, mm0 + paddw mm0, mm3 + paddw mm1, [sum+8] ; h + psrlq mm2, 16 + paddw mm2, mm3 + + movq mm3, [ecx+16] ; top left + movq mm4, [ecx+24] ; top right + psllw mm3, 3 + psllw mm4, 3 + psubw mm3, [sum+16] + psubw mm4, [sum+24] + MMX_ABS_TWO mm3, mm4, mm5, mm6 + paddw mm2, mm3 + paddw mm2, mm4 ; v + + SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd + mov eax, [args+8] + movd ecx, mm2 + movd edx, mm1 + add ecx, 2 + add edx, 2 + shr ecx, 2 + shr edx, 2 + mov [eax+0], ecx ; i8x8_v satd + mov [eax+4], edx ; i8x8_h satd + movd ecx, mm0 + add ecx, 2 + shr ecx, 2 + mov [eax+8], ecx ; i8x8_dc satd + + add esp, 0x70 + ret +%undef args +%undef spill +%undef trans +%undef sum + diff --git a/common/i386/pixel.h b/common/i386/pixel.h index 8baf7c7a..d5c0c2b8 100644 --- 
a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -90,4 +90,11 @@ int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ); int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ); +void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * ); +void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int *, int ); +void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int *, int ); +void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * ); + #endif diff --git a/common/i386/predict-c.c b/common/i386/predict-c.c index e46f32d6..3176afdc 100644 --- a/common/i386/predict-c.c +++ b/common/i386/predict-c.c @@ -24,6 +24,7 @@ #include "common/common.h" #include "common/clip1.h" #include "predict.h" +#include "pixel.h" extern void predict_16x16_v_mmx( uint8_t *src ); extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left ); @@ -416,20 +417,20 @@ static void predict_4x4_hu( uint8_t *src ) ****************************************************************************/ #define PL(y) \ - const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; + int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2; #define PREDICT_8x8_LOAD_LEFT(have_tl) \ - const int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \ + int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \ + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \ PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \ - UNUSED const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; + UNUSED int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2; #define PT(x) \ - const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; + int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2; #define PREDICT_8x8_LOAD_TOP(have_tl) \ - const int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \ + int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \ + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \ PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \ - UNUSED const int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) \ + UNUSED int t7 = ((i_neighbor&MB_TOPRIGHT ? 
SRC(8,-1) : SRC(7,-1)) \ + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \ #define PTR(x) \ @@ -442,7 +443,7 @@ static void predict_4x4_hu( uint8_t *src ) } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); #define PREDICT_8x8_LOAD_TOPLEFT \ - const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2; + int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2; #define PREDICT_8x8_DC(v) \ int y; \ @@ -549,6 +550,45 @@ static void predict_8x8_vr_mmxext( uint8_t *src, int i_neighbor ) } #endif +#ifdef ARCH_X86 +#define SUMSUB(a,b,c,d,e,f,g,h)\ + t=a; a+=b; b-=t;\ + t=c; c+=d; d-=t;\ + t=e; e+=f; f-=t;\ + t=g; g+=h; h-=t; + +void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t *src, int res[3], int i_neighbor ) +{ + PREDICT_8x8_LOAD_TOP(1) + PREDICT_8x8_LOAD_LEFT(1) + int t; + int16_t edges[2][8]; + SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7); + SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7); + SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7); + edges[0][0] = l0; + edges[0][1] = l1; + edges[0][2] = l2; + edges[0][3] = l3; + edges[0][4] = l4; + edges[0][5] = l5; + edges[0][6] = l6; + edges[0][7] = l7; + SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7); + SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7); + SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7); + edges[1][0] = t0; + edges[1][1] = t1; + edges[1][2] = t2; + edges[1][3] = t3; + edges[1][4] = t4; + edges[1][5] = t5; + edges[1][6] = t6; + edges[1][7] = t7; + x264_intra_sa8d_x3_8x8_core_mmxext( fenc, edges, res ); +} +#endif + /**************************************************************************** * Exported functions: ****************************************************************************/ diff --git a/common/pixel.c b/common/pixel.c index d98b7c7e..54b03553 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -369,7 +369,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #ifdef ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext; #endif + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext; } #endif @@ -403,6 +407,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) #ifdef ARCH_X86_64 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif } #endif diff --git a/common/pixel.h b/common/pixel.h index 18a0746f..ba3c3346 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -78,6 +78,13 @@ typedef struct /* multiple parallel calls to sad. */ x264_pixel_cmp_x3_t sad_x3[7]; x264_pixel_cmp_x4_t sad_x4[7]; + + /* calculate satd of V, H, and DC modes. + * may be NULL, in which case just use pred+satd instead. 
*/ + void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] ); + void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] ); + void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] ); + void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t *fdec, int res[3], int i_neighbors ); } x264_pixel_function_t; void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ); diff --git a/encoder/analyse.c b/encoder/analyse.c index 95f11161..cc056c98 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -437,25 +437,46 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); a->i_satd_i8x8chroma = COST_MAX; - for( i = 0; i < i_max; i++ ) + if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] ) + { + int satdu[4], satdv[4]; + h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu ); + h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv ); + h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] ); + h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] ); + satdu[I_PRED_CHROMA_P] = + h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE ); + satdv[I_PRED_CHROMA_P] = + h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE ); + + for( i=0; ii_lambda * bs_size_ue(i_mode); + COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode ); + } + } + else { - int i_satd; - int i_mode; - - i_mode = predict_mode[i]; + for( i=0; ipredict_8x8c[i_mode]( p_dstc[0] ); - h->predict_8x8c[i_mode]( p_dstc[1] ); + /* we do the prediction */ + h->predict_8x8c[i_mode]( p_dstc[0] ); + h->predict_8x8c[i_mode]( p_dstc[1] ); - /* we calculate the cost */ - i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, - p_srcc[0], FENC_STRIDE ) + - h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, - p_srcc[1], FENC_STRIDE ) + - a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] ); + /* we calculate the cost */ + i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, + p_srcc[0], FENC_STRIDE ) + + h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, + p_srcc[1], FENC_STRIDE ) + + a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] ); - COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode ); + COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode ); + } } h->mb.i_chroma_pred_mode = a->i_predict8x8chroma; @@ -470,21 +491,38 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i, idx; int i_max; int predict_mode[9]; + int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0]; /*---------------- Try all mode and calculate their score ---------------*/ /* 16x16 prediction selection */ predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max ); - for( i = 0; i < i_max; i++ ) + + if( b_merged_satd && i_max == 4 ) { - int i_satd; - int i_mode = predict_mode[i]; - h->predict_16x16[i_mode]( p_dst ); + h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir ); + h->predict_16x16[I_PRED_16x16_P]( p_dst ); + a->i_satd_i16x16_dir[I_PRED_16x16_P] = + h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); + for( i=0; i<4; i++ ) + { + int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i); + COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i ); + } + } + else + { + for( i = 0; i < i_max; i++ ) + { + int i_satd; + int i_mode = predict_mode[i]; + h->predict_16x16[i_mode]( p_dst 
); - i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) + - a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); - COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); - a->i_satd_i16x16_dir[i_mode] = i_satd; + i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) + + a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); + COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); + a->i_satd_i16x16_dir[i_mode] = i_satd; + } } if( h->sh.i_type == SLICE_TYPE_B ) @@ -499,6 +537,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8]; int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 ); int i_cost = 0; + b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0]; // FIXME some bias like in i4x4? if( h->sh.i_type == SLICE_TYPE_B ) @@ -514,7 +553,24 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx ); predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max ); - for( i = 0; i < i_max; i++ ) + + if( b_merged_satd && i_max == 9 ) + { + int satd[3]; + h->pixf.intra_sa8d_x3_8x8( p_src_by, p_dst_by, satd, h->mb.i_neighbour8[idx] ); + if( i_pred_mode < 3 ) + satd[i_pred_mode] -= 3 * a->i_lambda; + for( i=2; i>=0; i-- ) + { + int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda; + COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i ); + } + i = 3; + } + else + i = 0; + + for( ; ii_satd_i16x16, a->i_satd_i8x8 ); + b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0]; if( a->b_mbrd ) i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8; @@ -577,16 +634,29 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ /* emulate missing topright samples */ *(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U; - for( i = 0; i < i_max; i++ ) + if( b_merged_satd && i_max >= 6 ) + { + int satd[3]; + h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd ); + if( i_pred_mode < 3 ) + satd[i_pred_mode] -= 3 * a->i_lambda; + for( i=2; i>=0; i-- ) + COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda, + a->i_predict4x4[idx], i ); + i = 3; + } + else + i = 0; + + for( ; ipredict_4x4[i_mode]( p_dst_by ); i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, - p_src_by, FENC_STRIDE ) + p_src_by, FENC_STRIDE ) + a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 
1 : 4); COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode ); diff --git a/encoder/slicetype_decision.c b/encoder/slicetype_decision.c index bc3effa5..6df55e3c 100644 --- a/encoder/slicetype_decision.c +++ b/encoder/slicetype_decision.c @@ -200,22 +200,32 @@ lowres_intra_mb: uint8_t *pix = &pix_buf[8+FDEC_STRIDE - 1]; uint8_t *src = &fenc->lowres[0][i_pel_offset - 1]; int intra_penalty = 5 + 10 * b_bidir; - i_cost_bak = i_bcost; + int satds[4], i_icost; memcpy( pix-FDEC_STRIDE, src-i_stride, 9 ); for( i=0; i<8; i++ ) pix[i*FDEC_STRIDE] = src[i*i_stride]; pix++; - for( i = I_PRED_CHROMA_DC; i <= I_PRED_CHROMA_P; i++ ) + if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] ) { - int i_cost; - h->predict_8x8c[i]( pix ); - i_cost = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ) + intra_penalty; - i_bcost = X264_MIN( i_bcost, i_cost ); + h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds ); + h->predict_8x8c[I_PRED_CHROMA_P]( pix ); + satds[I_PRED_CHROMA_P] = + h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); } - if( i_bcost != i_cost_bak ) + else + { + for( i=0; i<4; i++ ) + { + h->predict_8x8c[i]( pix ); + satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); + } + } + i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] ) + intra_penalty; + if( i_icost < i_bcost ) { + i_bcost = i_icost; if( !b_bidir && i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1 && i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 ) diff --git a/tools/checkasm.c b/tools/checkasm.c index e91f1bf8..680b64bb 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -32,12 +32,20 @@ static int check_pixel( int cpu_ref, int cpu_new ) x264_pixel_function_t pixel_c; x264_pixel_function_t pixel_ref; x264_pixel_function_t pixel_asm; + x264_predict_t predict_16x16[4+3]; + x264_predict_t predict_8x8c[4+3]; + x264_predict_t predict_4x4[9+3]; + x264_predict8x8_t predict_8x8[9+3]; int ret = 0, ok, used_asm; int i; x264_pixel_init( 0, &pixel_c ); x264_pixel_init( cpu_ref, &pixel_ref ); x264_pixel_init( cpu_new, &pixel_asm ); + x264_predict_16x16_init( 0, predict_16x16 ); + x264_predict_8x8c_init( 0, predict_8x8c ); + x264_predict_8x8_init( 0, predict_8x8 ); + x264_predict_4x4_init( 0, predict_4x4 ); #define TEST_PIXEL( name ) \ for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \ @@ -92,6 +100,38 @@ static int check_pixel( int cpu_ref, int cpu_new ) TEST_PIXEL_X(3); TEST_PIXEL_X(4); + +#define TEST_INTRA_SATD( name, pred, satd, ... 
) \ + if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \ + { \ + int res_c[3], res_asm[3]; \ + used_asm = 1; \ + memcpy( buf3, buf2, 1024 ); \ + for( i=0; i<3; i++ ) \ + { \ + pred[i]( buf3+40, ##__VA_ARGS__ ); \ + res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \ + } \ + pixel_asm.name( buf1+40, buf3+40, res_asm, ##__VA_ARGS__ ); \ + if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ + { \ + ok = 0; \ + fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \ + res_c[0], res_c[1], res_c[2], \ + res_asm[0], res_asm[1], res_asm[2] ); \ + } \ + } + + ok = 1; used_asm = 0; + TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16] ); + TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8] ); + TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4] ); + TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], + MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT ); + TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], + MB_LEFT|MB_TOP|MB_TOPLEFT ); + report( "intra satd_x3 :" ); + return ret; } -- 2.40.0
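
The idea behind the intra_satd_x3 routines above can be written in a few lines of C.
The sketch below is illustrative only: the helper names are made up, it assumes both
the top and left edges are available (as the merged paths in analyse.c require), and
it ignores the exact rounding of the pavgw-based MMX code. Because the Hadamard
transform is linear and the V, H and DC predictions are constant along columns, rows
and the whole block respectively, their transforms are nonzero only in row 0, column 0
or the DC coefficient. One 2D transform of the source plus two 1D transforms of the
edges therefore yields all three SATD scores, instead of predicting three blocks and
transforming each difference.

#include <stdint.h>
#include <stdlib.h>

/* 4-point unnormalized Hadamard transform; d[0] is the sum of s[]. */
static void hadamard4( int d[4], const int s[4] )
{
    int t0 = s[0] + s[1], t1 = s[0] - s[1];
    int t2 = s[2] + s[3], t3 = s[2] - s[3];
    d[0] = t0 + t2; d[2] = t0 - t2;
    d[1] = t1 + t3; d[3] = t1 - t3;
}

/* res[0] = V, res[1] = H, res[2] = DC, in the same order as the 4x4 asm above. */
static void intra_satd_x3_4x4_model( const uint8_t *fenc, int enc_stride,
                                     const uint8_t *fdec, int dec_stride,
                                     int res[3] )
{
    int tmp[4][4], coef[4][4], row[4], col[4], out[4];
    int top[4], left[4], t1d[4], l1d[4];
    int x, y, dc;
    int sum_core = 0; /* |coef| over x>0 && y>0: shared by all three modes */
    int satd_v, satd_h, satd_dc;

    /* One 2D Hadamard of the source block (rows, then columns). */
    for( y = 0; y < 4; y++ )
    {
        for( x = 0; x < 4; x++ )
            row[x] = fenc[y*enc_stride + x];
        hadamard4( tmp[y], row );
    }
    for( x = 0; x < 4; x++ )
    {
        for( y = 0; y < 4; y++ )
            col[y] = tmp[y][x];
        hadamard4( out, col );
        for( y = 0; y < 4; y++ )
            coef[y][x] = out[y];
    }

    /* 1D Hadamards of the edges the V/H/DC predictions are built from. */
    for( x = 0; x < 4; x++ ) top[x]  = fdec[x - dec_stride];
    for( y = 0; y < 4; y++ ) left[y] = fdec[y*dec_stride - 1];
    hadamard4( t1d, top );
    hadamard4( l1d, left );
    dc = ( t1d[0] + l1d[0] + 4 ) >> 3; /* t1d[0], l1d[0] are the edge sums */

    for( y = 1; y < 4; y++ )
        for( x = 1; x < 4; x++ )
            sum_core += abs( coef[y][x] );

    /* V prediction transforms to 4*t1d[] in row 0, zero elsewhere. */
    satd_v = sum_core;
    for( y = 1; y < 4; y++ ) satd_v += abs( coef[y][0] );
    for( x = 0; x < 4; x++ ) satd_v += abs( coef[0][x] - 4*t1d[x] );

    /* H prediction transforms to 4*l1d[] in column 0, zero elsewhere. */
    satd_h = sum_core;
    for( x = 1; x < 4; x++ ) satd_h += abs( coef[0][x] );
    for( y = 0; y < 4; y++ ) satd_h += abs( coef[y][0] - 4*l1d[y] );

    /* DC prediction transforms to 16*dc at (0,0), zero elsewhere. */
    satd_dc = sum_core;
    for( x = 1; x < 4; x++ ) satd_dc += abs( coef[0][x] );
    for( y = 1; y < 4; y++ ) satd_dc += abs( coef[y][0] );
    satd_dc += abs( coef[0][0] - 16*dc );

    res[0] = satd_v  >> 1;
    res[1] = satd_h  >> 1;
    res[2] = satd_dc >> 1;
}

The 16x16 and 8x8 chroma versions accumulate the same per-4x4 terms over the whole
block (with per-quadrant DC values for chroma), and the SSE2 sa8d version applies the
same decomposition with 8-point transforms over the filtered 8x8 edges.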