SBUTTERFLYdq %5, %2, %3
%endmacro
+%macro MMX_ABS 2 ; mma, tmp
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+%endmacro
+
%macro MMX_ABS_TWO 4 ; mma, mmb, tmp0, tmp1
pxor %3, %3
pxor %4, %4
cglobal x264_pixel_satd_8x16_mmxext
cglobal x264_pixel_satd_16x16_mmxext
+cglobal x264_intra_satd_x3_4x4_mmxext
+cglobal x264_intra_satd_x3_8x8c_mmxext
+cglobal x264_intra_satd_x3_16x16_mmxext
+
+
%macro SAD_START 0
pxor mm0, mm0
%endmacro
movd eax, mm0
ret
+
+; in: parm1 = fenc
+; out: mm0..mm3 = hadamard coefs
+ALIGN 16
+load_hadamard:
+ pxor mm7, mm7
+ movd mm0, [parm1q+0*FENC_STRIDE]
+ movd mm4, [parm1q+1*FENC_STRIDE]
+ movd mm3, [parm1q+2*FENC_STRIDE]
+ movd mm1, [parm1q+3*FENC_STRIDE]
+ punpcklbw mm0, mm7
+ punpcklbw mm4, mm7
+ punpcklbw mm3, mm7
+ punpcklbw mm1, mm7
+ HADAMARD4x4 mm0, mm4, mm3, mm1
+ TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
+ HADAMARD4x4 mm0, mm1, mm2, mm3
+ ret
+
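+; two scalar butterflies: (%1,%2) -> (%1+%2, %2-%1), (%3,%4) -> (%3+%4, %4-%3)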
+%macro SCALAR_SUMSUB 4
+ add %1, %2
+ add %3, %4
+ add %2, %2
+ add %4, %4
+ sub %2, %1
+ sub %4, %3
+%endmacro
+
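+; horizontal sum of the 4 words in each of 3 mm regs: fold high half onto low,
+; widen to dwords, then combine the halves with %8 (paddd, or pavgw to also halve)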
+%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
+ pxor %7, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ paddw %1, %4
+ paddw %2, %5
+ paddw %3, %6
+ punpcklwd %1, %7
+ punpcklwd %2, %7
+ punpcklwd %3, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ %8 %1, %4
+ %8 %2, %5
+ %8 %3, %6
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_4x4_mmxext:
+%define top_1d rsp-8 ; +8
+%define left_1d rsp-16 ; +8
+ call load_hadamard
+
+ movzx r8d, byte [parm2q-1+0*FDEC_STRIDE]
+ movzx r9d, byte [parm2q-1+1*FDEC_STRIDE]
+ movzx r10d, byte [parm2q-1+2*FDEC_STRIDE]
+ movzx r11d, byte [parm2q-1+3*FDEC_STRIDE]
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 1x4 hadamard
+ mov [left_1d+0], r8w
+ mov [left_1d+2], r9w
+ mov [left_1d+4], r10w
+ mov [left_1d+6], r11w
+ mov eax, r8d ; dc
+
+ movzx r8d, byte [parm2q-FDEC_STRIDE+0]
+ movzx r9d, byte [parm2q-FDEC_STRIDE+1]
+ movzx r10d, byte [parm2q-FDEC_STRIDE+2]
+ movzx r11d, byte [parm2q-FDEC_STRIDE+3]
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d ; 4x1 hadamard
+ lea rax, [rax + r8 + 4] ; dc
+ mov [top_1d+0], r8w
+ mov [top_1d+2], r9w
+ mov [top_1d+4], r10w
+ mov [top_1d+6], r11w
+ and eax, -8
+ shl eax, 1
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d]
+ movd mm5, eax
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ paddw mm4, mm7
+ paddw mm5, mm7
+ movq mm1, mm5
+ psrlq mm1, 16 ; 4x3 sum
+ paddw mm0, mm1
+
+ SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+ movd [parm3q+0], mm0 ; i4x4_v satd
+ movd [parm3q+4], mm4 ; i4x4_h satd
+ movd [parm3q+8], mm5 ; i4x4_dc satd
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_16x16_mmxext:
+%define sums rsp-32 ; +24
+%define top_1d rsp-64 ; +32
+%define left_1d rsp-96 ; +32
+
+ mov qword [sums+0], 0
+ mov qword [sums+8], 0
+ mov qword [sums+16], 0
+
+ ; 1D hadamards
+ xor ecx, ecx
+ mov eax, 12
+.loop_edge:
+ ; left
+ shl eax, 5 ; log(FDEC_STRIDE)
+ movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE]
+ movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE]
+ movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
+ movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
+ shr eax, 5
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d
+ add ecx, r8d
+ mov [left_1d+2*rax+0], r8w
+ mov [left_1d+2*rax+2], r9w
+ mov [left_1d+2*rax+4], r10w
+ mov [left_1d+2*rax+6], r11w
+
+ ; top
+ movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0]
+ movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1]
+ movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2]
+ movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3]
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d
+ add ecx, r8d
+ mov [top_1d+2*rax+0], r8w
+ mov [top_1d+2*rax+2], r9w
+ mov [top_1d+2*rax+4], r10w
+ mov [top_1d+2*rax+6], r11w
+ sub eax, 4
+ jge .loop_edge
+
+ ; dc
+ shr ecx, 1
+ add ecx, 8
+ and ecx, -16
+
+ ; 2D hadamards
+ xor eax, eax
+.loop_y:
+ xor esi, esi
+.loop_x:
+ call load_hadamard
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d+8*rax]
+ movd mm5, ecx
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d+8*rsi]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+0] ; i4x4_v satd
+ paddw mm4, [sums+8] ; i4x4_h satd
+ paddw mm5, [sums+16] ; i4x4_dc satd
+ movq [sums+0], mm0
+ movq [sums+8], mm4
+ movq [sums+16], mm5
+
+ add parm1q, 4
+ inc esi
+ cmp esi, 4
+ jl .loop_x
+ add parm1q, 4*FENC_STRIDE-16
+ inc eax
+ cmp eax, 4
+ jl .loop_y
+
+; horizontal sum
+ movq mm2, [sums+16]
+ movq mm1, [sums+8]
+ movq mm0, [sums+0]
+ movq mm7, mm2
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ psrld mm0, 1
+ pslld mm7, 16
+ psrld mm7, 16
+ paddd mm0, mm2
+ psubd mm0, mm7
+ movd [parm3q+8], mm2 ; i16x16_dc satd
+ movd [parm3q+4], mm1 ; i16x16_h satd
+ movd [parm3q+0], mm0 ; i16x16_v satd
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_8x8c_mmxext:
+%define sums rsp-32 ; +24
+%define top_1d rsp-48 ; +16
+%define left_1d rsp-64 ; +16
+
+ mov qword [sums+0], 0
+ mov qword [sums+8], 0
+ mov qword [sums+16], 0
+
+ ; 1D hadamards
+ mov eax, 4
+.loop_edge:
+ ; left
+ shl eax, 5 ; log(FDEC_STRIDE)
+ movzx r8d, byte [parm2q+rax-1+0*FDEC_STRIDE]
+ movzx r9d, byte [parm2q+rax-1+1*FDEC_STRIDE]
+ movzx r10d, byte [parm2q+rax-1+2*FDEC_STRIDE]
+ movzx r11d, byte [parm2q+rax-1+3*FDEC_STRIDE]
+ shr eax, 5
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d
+ mov [left_1d+2*rax+0], r8w
+ mov [left_1d+2*rax+2], r9w
+ mov [left_1d+2*rax+4], r10w
+ mov [left_1d+2*rax+6], r11w
+
+ ; top
+ movzx r8d, byte [parm2q+rax-FDEC_STRIDE+0]
+ movzx r9d, byte [parm2q+rax-FDEC_STRIDE+1]
+ movzx r10d, byte [parm2q+rax-FDEC_STRIDE+2]
+ movzx r11d, byte [parm2q+rax-FDEC_STRIDE+3]
+ SCALAR_SUMSUB r8d, r9d, r10d, r11d
+ SCALAR_SUMSUB r8d, r10d, r9d, r11d
+ mov [top_1d+2*rax+0], r8w
+ mov [top_1d+2*rax+2], r9w
+ mov [top_1d+2*rax+4], r10w
+ mov [top_1d+2*rax+6], r11w
+ sub eax, 4
+ jge .loop_edge
+
+ ; dc
+ movzx r8d, word [left_1d+0]
+ movzx r9d, word [top_1d+0]
+ movzx r10d, word [left_1d+8]
+ movzx r11d, word [top_1d+8]
+ add r8d, r9d
+ lea r9, [r10 + r11]
+ lea r8, [2*r8 + 8]
+ lea r9, [2*r9 + 8]
+ lea r10, [4*r10 + 8]
+ lea r11, [4*r11 + 8]
+ and r8d, -16 ; tl
+ and r9d, -16 ; br
+ and r10d, -16 ; bl
+ and r11d, -16 ; tr
+ shl r9, 16
+ mov r9w, r10w
+ shl r9, 16
+ mov r9w, r11w
+ shl r9, 16
+ mov r9w, r8w
+
+ ; 2D hadamards
+ xor eax, eax
+.loop_y:
+ xor esi, esi
+.loop_x:
+ call load_hadamard
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d+8*rax]
+ movzx ecx, r9w
+ shr r9, 16
+ movd mm5, ecx
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d+8*rsi]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+16] ; i4x4_v satd
+ paddw mm4, [sums+8] ; i4x4_h satd
+ paddw mm5, [sums+0] ; i4x4_dc satd
+ movq [sums+16], mm0
+ movq [sums+8], mm4
+ movq [sums+0], mm5
+
+ add parm1q, 4
+ inc esi
+ cmp esi, 2
+ jl .loop_x
+ add parm1q, 4*FENC_STRIDE-8
+ inc eax
+ cmp eax, 2
+ jl .loop_y
+
+; horizontal sum
+ movq mm0, [sums+0]
+ movq mm1, [sums+8]
+ movq mm2, [sums+16]
+ movq mm7, mm0
+ psrlq mm7, 15
+ paddw mm2, mm7
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ psrld mm2, 1
+ movd [parm3q+0], mm0 ; i8x8c_dc satd
+ movd [parm3q+4], mm1 ; i8x8c_h satd
+ movd [parm3q+8], mm2 ; i8x8c_v satd
+ ret
SECTION .rodata align=16
pd_0000ffff: times 4 dd 0x0000ffff
+pb_1: times 16 db 1
SECTION .text
cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
+cglobal x264_intra_sa8d_x3_8x8_sse2
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [rdx]
punpckhqdq xmm5, xmm3 ; next 4x4 rows 1 and 3
%endmacro
+%macro SUM1x8_SSE2 3 ; in, junk, sum
+ pxor %2, %2
+ psubw %2, %1
+ pmaxsw %1, %2
+ paddusw %3, %1
+%endmacro
+
%macro SUM4x4_SSE2 4 ; 02 13 junk sum
pxor %3, %3
psubw %3, %1
%endmacro
%macro SUM_MM_SSE2 2 ; sum junk
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw %1, 1
movdqa %2, %1
psrldq %1, 2
paddusw %1, %2
%endmacro
%macro SATD_END 0
+ psrlw xmm6, 1
SUM_MM_SSE2 xmm6, xmm7
ret
%endmacro
pxor xmm10, xmm10
SUM4x4_TWO_SSE2 xmm0, xmm1, xmm6, xmm2, xmm3, xmm9, xmm10
SUM4x4_TWO_SSE2 xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10
+ psrlw xmm10, 1
SUM_MM_SSE2 xmm10, xmm0
add r8d, eax ; preserve rounding for 16x16
add eax, 1
add eax, 1
shr eax, 1
ret
+
+
+
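+; in: %1 = fenc
+; out: xmm0..xmm7 = 2D hadamard coefs of one 8x8 block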
+%macro LOAD_HADAMARD8 1
+ pxor xmm4, xmm4
+ movq xmm0, [%1+0*FENC_STRIDE]
+ movq xmm7, [%1+1*FENC_STRIDE]
+ movq xmm6, [%1+2*FENC_STRIDE]
+ movq xmm3, [%1+3*FENC_STRIDE]
+ movq xmm5, [%1+4*FENC_STRIDE]
+ movq xmm1, [%1+5*FENC_STRIDE]
+ movq xmm8, [%1+6*FENC_STRIDE]
+ movq xmm2, [%1+7*FENC_STRIDE]
+ punpcklbw xmm0, xmm4
+ punpcklbw xmm7, xmm4
+ punpcklbw xmm6, xmm4
+ punpcklbw xmm3, xmm4
+ punpcklbw xmm5, xmm4
+ punpcklbw xmm1, xmm4
+ punpcklbw xmm8, xmm4
+ punpcklbw xmm2, xmm4
+ HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
+ TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
+ HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+%endmacro
+
+%macro SCALAR_SUMSUB 4
+ add %1, %2
+ add %3, %4
+ add %2, %2
+ add %4, %4
+ sub %2, %1
+ sub %4, %3
+%endmacro
+
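+; 1D 8-point hadamard of 8 scalar regs, stored as 8 words at [%9]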
+%macro SCALAR_HADAMARD1x8 9 ; 8x tmp, dst
+ SCALAR_SUMSUB %1, %5, %2, %6
+ SCALAR_SUMSUB %3, %7, %4, %8
+ SCALAR_SUMSUB %1, %3, %2, %4
+ SCALAR_SUMSUB %5, %7, %6, %8
+ SCALAR_SUMSUB %1, %2, %3, %4
+ SCALAR_SUMSUB %5, %6, %7, %8
+ mov [%9+0], %1
+ mov [%9+2], %2
+ mov [%9+4], %3
+ mov [%9+6], %4
+ mov [%9+8], %5
+ mov [%9+10], %6
+ mov [%9+12], %7
+ mov [%9+14], %8
+%endmacro
+
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
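+; trick: pavgb rounds up, so subtract (left^right)&1 to get floor((l+r)/2)
+; before averaging with src; the result is then exactly the filter above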
+%macro PRED8x8_LOWPASS 5
+ movq %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ movq %1, %4
+ pand %3, [pb_1 GLOBAL]
+ psubusb %2, %3
+ pavgb %1, %2
+%endmacro
+
+; output: mm0 = filtered t0..t7
+; assumes topleft is available
+%macro PRED8x8_LOAD_TOP_FILT 1
+ movq mm1, [%1-1]
+ movq mm2, [%1+1]
+ and parm4d, byte 4
+ jne .have_topright
+ mov al, [%1+7]
+ mov ah, al
+ pinsrw mm2, eax, 3
+.have_topright:
+ PRED8x8_LOWPASS mm0, mm1, mm2, [%1], mm7
+%endmacro
+
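+; scalar version of the same lowpass filter for the 8 left-edge pixels:
+; out[n] = (p[n-1] + 2*p[n] + p[n+1] + 2) >> 2, reading the topleft for p[-1]
+; and duplicating p[7] at the bottom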
+%macro PRED8x8_LOAD_LEFT_FILT 10 ; 8x reg, tmp, src
+ movzx %1, byte [%10-1*FDEC_STRIDE]
+ movzx %2, byte [%10+0*FDEC_STRIDE]
+ movzx %3, byte [%10+1*FDEC_STRIDE]
+ movzx %4, byte [%10+2*FDEC_STRIDE]
+ movzx %5, byte [%10+3*FDEC_STRIDE]
+ movzx %6, byte [%10+4*FDEC_STRIDE]
+ movzx %7, byte [%10+5*FDEC_STRIDE]
+ movzx %8, byte [%10+6*FDEC_STRIDE]
+ movzx %9, byte [%10+7*FDEC_STRIDE]
+ lea %1, [%1+%2+1]
+ lea %2, [%2+%3+1]
+ lea %3, [%3+%4+1]
+ lea %4, [%4+%5+1]
+ lea %5, [%5+%6+1]
+ lea %6, [%6+%7+1]
+ lea %7, [%7+%8+1]
+ lea %8, [%8+%9+1]
+ lea %9, [%9+%9+1]
+ add %1, %2
+ add %2, %3
+ add %3, %4
+ add %4, %5
+ add %5, %6
+ add %6, %7
+ add %7, %8
+ add %8, %9
+ shr %1, 2
+ shr %2, 2
+ shr %3, 2
+ shr %4, 2
+ shr %5, 2
+ shr %6, 2
+ shr %7, 2
+ shr %8, 2
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t *fdec,
+; int *res, int i_neighbors )
+;-----------------------------------------------------------------------------
+x264_intra_sa8d_x3_8x8_sse2:
+%define left_1d rsp-16 ; +16
+%define top_1d rsp-32 ; +16
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ LOAD_HADAMARD8 parm1q
+
+ PRED8x8_LOAD_LEFT_FILT r8, r9, r10, r11, r12, r13, r14, r15, rax, parm2q-1
+ SCALAR_HADAMARD1x8 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d, left_1d
+ mov edi, r8d ; dc
+
+ PRED8x8_LOAD_TOP_FILT parm2q-FDEC_STRIDE
+ movq [top_1d], mm0
+ movzx r8d, byte [top_1d+0]
+ movzx r9d, byte [top_1d+1]
+ movzx r10d, byte [top_1d+2]
+ movzx r11d, byte [top_1d+3]
+ movzx r12d, byte [top_1d+4]
+ movzx r13d, byte [top_1d+5]
+ movzx r14d, byte [top_1d+6]
+ movzx r15d, byte [top_1d+7]
+ SCALAR_HADAMARD1x8 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w, top_1d
+ lea rdi, [rdi + r8 + 8] ; dc
+ and edi, -16
+ shl edi, 2
+
+ pxor xmm15, xmm15
+ movdqa xmm8, xmm2
+ movdqa xmm9, xmm3
+ movdqa xmm10, xmm4
+ movdqa xmm11, xmm5
+ SUM4x4_TWO_SSE2 xmm8, xmm9, xmm12, xmm10, xmm11, xmm13, xmm15
+ movdqa xmm8, xmm6
+ movdqa xmm9, xmm7
+ SUM4x4_SSE2 xmm8, xmm9, xmm10, xmm15
+ movdqa xmm8, xmm1
+ SUM1x8_SSE2 xmm8, xmm10, xmm15
+ movdqa xmm14, xmm15 ; 7x8 sum
+
+ movdqa xmm8, [left_1d] ; left edge
+ movd xmm9, edi
+ psllw xmm8, 3
+ psubw xmm8, xmm0
+ psubw xmm9, xmm0
+ SUM1x8_SSE2 xmm8, xmm10, xmm14
+ SUM1x8_SSE2 xmm9, xmm11, xmm15 ; 1x8 sum
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ punpcklwd xmm4, xmm5
+ punpcklwd xmm6, xmm7
+ punpckldq xmm0, xmm2
+ punpckldq xmm4, xmm6
+ punpcklqdq xmm0, xmm4 ; transpose
+ movdqa xmm1, [top_1d]
+ movdqa xmm2, xmm15
+ psllw xmm1, 3
+ psrldq xmm2, 2 ; 8x7 sum
+ psubw xmm0, xmm1 ; 8x1 sum
+ SUM1x8_SSE2 xmm0, xmm1, xmm2
+
+ SUM_MM_SSE2 xmm14, xmm3
+ add eax, 2
+ shr eax, 2
+ mov [parm3q+4], eax ; i8x8_h sa8d
+ SUM_MM_SSE2 xmm15, xmm4
+ add eax, 2
+ shr eax, 2
+ mov [parm3q+8], eax ; i8x8_dc sa8d
+ SUM_MM_SSE2 xmm2, xmm5
+ add eax, 2
+ shr eax, 2
+ mov [parm3q+0], eax ; i8x8_v sa8d
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ ret
cglobal x264_pixel_sa8d_16x16_mmxext
cglobal x264_pixel_sa8d_8x8_mmxext
+cglobal x264_intra_satd_x3_4x4_mmxext
+cglobal x264_intra_satd_x3_8x8c_mmxext
+cglobal x264_intra_satd_x3_16x16_mmxext
+cglobal x264_intra_sa8d_x3_8x8_core_mmxext
+
+
%macro SAD_START 0
push ebx
SATD_START
sub esp, 0x70
%define args esp+0x74
-%define spill esp+0x60
+%define spill esp+0x60 ; +16
+%define trans esp+0 ; +96
LOAD_DIFF_4x8P 0
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
movq [spill], mm0
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq [esp+0x00], mm4
- movq [esp+0x08], mm7
- movq [esp+0x10], mm0
- movq [esp+0x18], mm6
+ movq [trans+0x00], mm4
+ movq [trans+0x08], mm7
+ movq [trans+0x10], mm0
+ movq [trans+0x18], mm6
movq mm0, [spill]
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
- movq [esp+0x20], mm0
- movq [esp+0x28], mm3
- movq [esp+0x30], mm4
- movq [esp+0x38], mm2
+ movq [trans+0x20], mm0
+ movq [trans+0x28], mm3
+ movq [trans+0x30], mm4
+ movq [trans+0x38], mm2
mov eax, [args+4]
mov ecx, [args+12]
movq [spill], mm7
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
- movq [esp+0x40], mm0
- movq [esp+0x48], mm3
- movq [esp+0x50], mm7
- movq [esp+0x58], mm2
+ movq [trans+0x40], mm0
+ movq [trans+0x48], mm3
+ movq [trans+0x50], mm7
+ movq [trans+0x58], mm2
movq mm7, [spill]
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
- movq mm5, [esp+0x00]
- movq mm1, [esp+0x08]
- movq mm2, [esp+0x10]
- movq mm3, [esp+0x18]
+ movq mm5, [trans+0x00]
+ movq mm1, [trans+0x08]
+ movq mm2, [trans+0x10]
+ movq mm3, [trans+0x18]
HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
SUM4x8_MM
- movq [esp], mm0
-
- movq mm0, [esp+0x20]
- movq mm1, [esp+0x28]
- movq mm2, [esp+0x30]
- movq mm3, [esp+0x38]
- movq mm4, [esp+0x40]
- movq mm5, [esp+0x48]
- movq mm6, [esp+0x50]
- movq mm7, [esp+0x58]
+ movq [trans], mm0
+
+ movq mm0, [trans+0x20]
+ movq mm1, [trans+0x28]
+ movq mm2, [trans+0x30]
+ movq mm3, [trans+0x38]
+ movq mm4, [trans+0x40]
+ movq mm5, [trans+0x48]
+ movq mm6, [trans+0x50]
+ movq mm7, [trans+0x58]
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
SUM4x8_MM
ret
%undef args
%undef spill
+%undef trans
ALIGN 16
;-----------------------------------------------------------------------------
pop edi
pop esi
ret
+
+
+; in: fenc
+; out: mm0..mm3 = hadamard coefs
+%macro LOAD_HADAMARD 1
+ pxor mm7, mm7
+ movd mm0, [%1+0*FENC_STRIDE]
+ movd mm4, [%1+1*FENC_STRIDE]
+ movd mm3, [%1+2*FENC_STRIDE]
+ movd mm1, [%1+3*FENC_STRIDE]
+ punpcklbw mm0, mm7
+ punpcklbw mm4, mm7
+ punpcklbw mm3, mm7
+ punpcklbw mm1, mm7
+ HADAMARD4x4 mm0, mm4, mm3, mm1
+ TRANSPOSE4x4 mm0, mm4, mm3, mm1, mm2
+ HADAMARD4x4 mm0, mm1, mm2, mm3
+%endmacro
+
+%macro SCALAR_SUMSUB 4
+ add %1, %2
+ add %3, %4
+ add %2, %2
+ add %4, %4
+ sub %2, %1
+ sub %4, %3
+%endmacro
+
+%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
+ pxor %7, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ paddusw %1, %4
+ paddusw %2, %5
+ paddusw %3, %6
+ punpcklwd %1, %7
+ punpcklwd %2, %7
+ punpcklwd %3, %7
+ pshufw %4, %1, 01001110b
+ pshufw %5, %2, 01001110b
+ pshufw %6, %3, 01001110b
+ %8 %1, %4
+ %8 %2, %5
+ %8 %3, %6
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_4x4_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_4x4_mmxext:
+ push ebx
+ push edi
+ push esi
+ sub esp, 16
+%define args esp+32
+%define top_1d esp+8 ; +8
+%define left_1d esp+0 ; +8
+
+ mov eax, [args+0] ; fenc
+ LOAD_HADAMARD eax
+
+ mov edi, [args+4] ; fdec
+ movzx eax, byte [edi-1+0*FDEC_STRIDE]
+ movzx ebx, byte [edi-1+1*FDEC_STRIDE]
+ movzx ecx, byte [edi-1+2*FDEC_STRIDE]
+ movzx edx, byte [edi-1+3*FDEC_STRIDE]
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx ; 1x4 hadamard
+ mov [left_1d+0], ax
+ mov [left_1d+2], bx
+ mov [left_1d+4], cx
+ mov [left_1d+6], dx
+ mov esi, eax ; dc
+
+ movzx eax, byte [edi-FDEC_STRIDE+0]
+ movzx ebx, byte [edi-FDEC_STRIDE+1]
+ movzx ecx, byte [edi-FDEC_STRIDE+2]
+ movzx edx, byte [edi-FDEC_STRIDE+3]
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx ; 4x1 hadamard
+ mov [top_1d+0], ax
+ mov [top_1d+2], bx
+ mov [top_1d+4], cx
+ mov [top_1d+6], dx
+ lea esi, [esi + eax + 4] ; dc
+ and esi, -8
+ shl esi, 1
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d]
+ movd mm5, esi
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ paddw mm4, mm7
+ paddw mm5, mm7
+ movq mm1, mm5
+ psrlq mm1, 16 ; 4x3 sum
+ paddw mm0, mm1
+
+ SUM_MM_X3 mm0, mm4, mm5, mm1, mm2, mm3, mm6, pavgw
+ mov eax, [args+8] ; res
+ movd [eax+0], mm0 ; i4x4_v satd
+ movd [eax+4], mm4 ; i4x4_h satd
+ movd [eax+8], mm5 ; i4x4_dc satd
+
+ add esp, 16
+ pop esi
+ pop edi
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_16x16_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_16x16_mmxext:
+ push ebx
+ push ebp
+ push edi
+ push esi
+ sub esp, 88
+%define args esp+108
+%define sums esp+64 ; +24
+%define top_1d esp+32 ; +32
+%define left_1d esp+0 ; +32
+
+ pxor mm0, mm0
+ movq [sums+0], mm0
+ movq [sums+8], mm0
+ movq [sums+16], mm0
+
+ ; 1D hadamards
+ mov edi, [args+4] ; fdec
+ xor ebp, ebp
+ mov esi, 12
+.loop_edge:
+ ; left
+ shl esi, 5 ; log(FDEC_STRIDE)
+ movzx eax, byte [edi+esi-1+0*FDEC_STRIDE]
+ movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE]
+ movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE]
+ movzx edx, byte [edi+esi-1+3*FDEC_STRIDE]
+ shr esi, 5
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx
+ add ebp, eax
+ mov [left_1d+2*esi+0], ax
+ mov [left_1d+2*esi+2], bx
+ mov [left_1d+2*esi+4], cx
+ mov [left_1d+2*esi+6], dx
+
+ ; top
+ movzx eax, byte [edi+esi-FDEC_STRIDE+0]
+ movzx ebx, byte [edi+esi-FDEC_STRIDE+1]
+ movzx ecx, byte [edi+esi-FDEC_STRIDE+2]
+ movzx edx, byte [edi+esi-FDEC_STRIDE+3]
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx
+ add ebp, eax
+ mov [top_1d+2*esi+0], ax
+ mov [top_1d+2*esi+2], bx
+ mov [top_1d+2*esi+4], cx
+ mov [top_1d+2*esi+6], dx
+ sub esi, 4
+ jge .loop_edge
+
+ ; dc
+ shr ebp, 1
+ add ebp, 8
+ and ebp, -16
+
+ ; 2D hadamards
+ mov eax, [args+0] ; fenc
+ xor edi, edi
+.loop_y:
+ xor esi, esi
+.loop_x:
+ LOAD_HADAMARD eax
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d+8*edi]
+ movd mm5, ebp
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d+8*esi]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+0] ; i4x4_v satd
+ paddw mm4, [sums+8] ; i4x4_h satd
+ paddw mm5, [sums+16] ; i4x4_dc satd
+ movq [sums+0], mm0
+ movq [sums+8], mm4
+ movq [sums+16], mm5
+
+ add eax, 4
+ inc esi
+ cmp esi, 4
+ jl .loop_x
+ add eax, 4*FENC_STRIDE-16
+ inc edi
+ cmp edi, 4
+ jl .loop_y
+
+; horizontal sum
+ movq mm2, [sums+16]
+ movq mm0, [sums+0]
+ movq mm1, [sums+8]
+ movq mm7, mm2
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ psrld mm0, 1
+ pslld mm7, 16
+ psrld mm7, 16
+ paddd mm0, mm2
+ psubd mm0, mm7
+ mov eax, [args+8] ; res
+ movd [eax+0], mm0 ; i16x16_v satd
+ movd [eax+4], mm1 ; i16x16_h satd
+ movd [eax+8], mm2 ; i16x16_dc satd
+
+ add esp, 88
+ pop esi
+ pop edi
+ pop ebp
+ pop ebx
+ ret
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_satd_x3_8x8c_mmxext( uint8_t *fenc, uint8_t *fdec, int *res )
+;-----------------------------------------------------------------------------
+x264_intra_satd_x3_8x8c_mmxext:
+ push ebx
+ push ebp
+ push edi
+ push esi
+ sub esp, 72
+%define args esp+92
+%define sums esp+48 ; +24
+%define dc_1d esp+32 ; +16
+%define top_1d esp+16 ; +16
+%define left_1d esp+0 ; +16
+
+ pxor mm0, mm0
+ movq [sums+0], mm0
+ movq [sums+8], mm0
+ movq [sums+16], mm0
+
+ ; 1D hadamards
+ mov edi, [args+4] ; fdec
+ xor ebp, ebp
+ mov esi, 4
+.loop_edge:
+ ; left
+ shl esi, 5 ; log(FDEC_STRIDE)
+ movzx eax, byte [edi+esi-1+0*FDEC_STRIDE]
+ movzx ebx, byte [edi+esi-1+1*FDEC_STRIDE]
+ movzx ecx, byte [edi+esi-1+2*FDEC_STRIDE]
+ movzx edx, byte [edi+esi-1+3*FDEC_STRIDE]
+ shr esi, 5
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx
+ mov [left_1d+2*esi+0], ax
+ mov [left_1d+2*esi+2], bx
+ mov [left_1d+2*esi+4], cx
+ mov [left_1d+2*esi+6], dx
+
+ ; top
+ movzx eax, byte [edi+esi-FDEC_STRIDE+0]
+ movzx ebx, byte [edi+esi-FDEC_STRIDE+1]
+ movzx ecx, byte [edi+esi-FDEC_STRIDE+2]
+ movzx edx, byte [edi+esi-FDEC_STRIDE+3]
+ SCALAR_SUMSUB eax, ebx, ecx, edx
+ SCALAR_SUMSUB eax, ecx, ebx, edx
+ mov [top_1d+2*esi+0], ax
+ mov [top_1d+2*esi+2], bx
+ mov [top_1d+2*esi+4], cx
+ mov [top_1d+2*esi+6], dx
+ sub esi, 4
+ jge .loop_edge
+
+ ; dc
+ movzx eax, word [left_1d+0]
+ movzx ebx, word [top_1d+0]
+ movzx ecx, word [left_1d+8]
+ movzx edx, word [top_1d+8]
+ add eax, ebx
+ lea ebx, [ecx + edx]
+ lea eax, [2*eax + 8]
+ lea ebx, [2*ebx + 8]
+ lea ecx, [4*ecx + 8]
+ lea edx, [4*edx + 8]
+ and eax, -16
+ and ebx, -16
+ and ecx, -16
+ and edx, -16
+ mov [dc_1d+ 0], eax ; tl
+ mov [dc_1d+ 4], edx ; tr
+ mov [dc_1d+ 8], ecx ; bl
+ mov [dc_1d+12], ebx ; br
+ lea ebp, [dc_1d]
+
+ ; 2D hadamards
+ mov eax, [args+0] ; fenc
+ xor edi, edi
+.loop_y:
+ xor esi, esi
+.loop_x:
+ LOAD_HADAMARD eax
+
+ movq mm4, mm1
+ movq mm5, mm2
+ MMX_ABS_TWO mm4, mm5, mm6, mm7
+ movq mm7, mm3
+ paddw mm4, mm5
+ MMX_ABS mm7, mm6
+ paddw mm7, mm4 ; 3x4 sum
+
+ movq mm4, [left_1d+8*edi]
+ movd mm5, [ebp]
+ psllw mm4, 2
+ psubw mm4, mm0
+ psubw mm5, mm0
+ punpcklwd mm0, mm1
+ punpcklwd mm2, mm3
+ punpckldq mm0, mm2 ; transpose
+ movq mm1, [top_1d+8*esi]
+ psllw mm1, 2
+ psubw mm0, mm1
+ MMX_ABS mm4, mm3 ; 1x4 sum
+ MMX_ABS mm5, mm2 ; 1x4 sum
+ MMX_ABS mm0, mm1 ; 4x1 sum
+ pavgw mm4, mm7
+ pavgw mm5, mm7
+ paddw mm0, [sums+16] ; i4x4_v satd
+ paddw mm4, [sums+8] ; i4x4_h satd
+ paddw mm5, [sums+0] ; i4x4_dc satd
+ movq [sums+16], mm0
+ movq [sums+8], mm4
+ movq [sums+0], mm5
+
+ add eax, 4
+ add ebp, 4
+ inc esi
+ cmp esi, 2
+ jl .loop_x
+ add eax, 4*FENC_STRIDE-8
+ inc edi
+ cmp edi, 2
+ jl .loop_y
+
+; horizontal sum
+ movq mm0, [sums+0]
+ movq mm1, [sums+8]
+ movq mm2, [sums+16]
+ movq mm6, mm0
+ psrlq mm6, 15
+ paddw mm2, mm6
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm7, paddd
+ psrld mm2, 1
+ mov eax, [args+8] ; res
+ movd [eax+0], mm0 ; i8x8c_dc satd
+ movd [eax+4], mm1 ; i8x8c_h satd
+ movd [eax+8], mm2 ; i8x8c_v satd
+
+ add esp, 72
+ pop esi
+ pop edi
+ pop ebp
+ pop ebx
+ ret
+
+%macro LOAD_4x8P 1 ; dx
+ pxor mm7, mm7
+ movd mm6, [eax+%1+7*FENC_STRIDE]
+ movd mm0, [eax+%1+0*FENC_STRIDE]
+ movd mm1, [eax+%1+1*FENC_STRIDE]
+ movd mm2, [eax+%1+2*FENC_STRIDE]
+ movd mm3, [eax+%1+3*FENC_STRIDE]
+ movd mm4, [eax+%1+4*FENC_STRIDE]
+ movd mm5, [eax+%1+5*FENC_STRIDE]
+ punpcklbw mm6, mm7
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+ movq [spill], mm6
+ punpcklbw mm2, mm7
+ punpcklbw mm3, mm7
+ movd mm6, [eax+%1+6*FENC_STRIDE]
+ punpcklbw mm4, mm7
+ punpcklbw mm5, mm7
+ punpcklbw mm6, mm7
+ movq mm7, [spill]
+%endmacro
+
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *fenc, int16_t edges[2][8], int *res )
+;-----------------------------------------------------------------------------
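+; edges[0] = 1D hadamard of the lowpass-filtered left edge, edges[1] = top edge;
+; set up by the C wrapper x264_intra_sa8d_x3_8x8_mmxext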
+x264_intra_sa8d_x3_8x8_core_mmxext:
+ mov eax, [esp+4]
+ mov ecx, [esp+8]
+ sub esp, 0x70
+%define args esp+0x74
+%define spill esp+0x60 ; +16
+%define trans esp+0 ; +96
+%define sum esp+0 ; +32
+ LOAD_4x8P 0
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm0
+ TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
+ movq [trans+0x00], mm4
+ movq [trans+0x08], mm7
+ movq [trans+0x10], mm0
+ movq [trans+0x18], mm6
+ movq mm0, [spill]
+ TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
+ movq [trans+0x20], mm0
+ movq [trans+0x28], mm3
+ movq [trans+0x30], mm4
+ movq [trans+0x38], mm2
+
+ LOAD_4x8P 4
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movq [spill], mm7
+ TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
+ movq [trans+0x40], mm0
+ movq [trans+0x48], mm3
+ movq [trans+0x50], mm7
+ movq [trans+0x58], mm2
+ movq mm7, [spill]
+ TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
+ movq mm5, [trans+0x00]
+ movq mm1, [trans+0x08]
+ movq mm2, [trans+0x10]
+ movq mm3, [trans+0x18]
+
+ HADAMARD1x8 mm5, mm1, mm2, mm3, mm4, mm7, mm0, mm6
+
+ movq [spill+0], mm5
+ movq [spill+8], mm7
+ MMX_ABS_TWO mm0, mm1, mm5, mm7
+ MMX_ABS_TWO mm2, mm3, mm5, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm1
+ MMX_ABS_TWO mm4, mm6, mm2, mm3
+ movq mm5, [spill+0]
+ movq mm7, [spill+8]
+ paddw mm0, mm4
+ paddw mm0, mm6
+ MMX_ABS mm7, mm1
+ paddw mm0, mm7 ; 7x4 sum
+ movq mm6, mm5
+ movq mm7, [ecx+8] ; left bottom
+ psllw mm7, 3
+ psubw mm6, mm7
+ MMX_ABS_TWO mm5, mm6, mm2, mm3
+ paddw mm5, mm0
+ paddw mm6, mm0
+ movq [sum+0], mm5 ; dc
+ movq [sum+8], mm6 ; left
+
+ movq mm0, [trans+0x20]
+ movq mm1, [trans+0x28]
+ movq mm2, [trans+0x30]
+ movq mm3, [trans+0x38]
+ movq mm4, [trans+0x40]
+ movq mm5, [trans+0x48]
+ movq mm6, [trans+0x50]
+ movq mm7, [trans+0x58]
+
+ HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
+
+ movd [sum+0x10], mm0
+ movd [sum+0x12], mm1
+ movd [sum+0x14], mm2
+ movd [sum+0x16], mm3
+ movd [sum+0x18], mm4
+ movd [sum+0x1a], mm5
+ movd [sum+0x1c], mm6
+ movd [sum+0x1e], mm7
+
+ movq [spill], mm0
+ movq [spill+8], mm1
+ MMX_ABS_TWO mm2, mm3, mm0, mm1
+ MMX_ABS_TWO mm4, mm5, mm0, mm1
+ paddw mm2, mm3
+ paddw mm4, mm5
+ paddw mm2, mm4
+ movq mm0, [spill]
+ movq mm1, [spill+8]
+ MMX_ABS_TWO mm6, mm7, mm4, mm5
+ MMX_ABS mm1, mm4
+ paddw mm2, mm7
+ paddw mm1, mm6
+ paddw mm2, mm1 ; 7x4 sum
+ movq mm1, mm0
+
+ movq mm7, [ecx+0]
+ psllw mm7, 3 ; left top
+
+ movzx edx, word [ecx+0]
+ add dx, [ecx+16]
+ lea edx, [4*edx+32]
+ and edx, -64
+ movd mm6, edx ; dc
+
+ psubw mm1, mm7
+ psubw mm0, mm6
+ MMX_ABS_TWO mm0, mm1, mm5, mm6
+ movq mm3, [sum+0] ; dc
+ paddw mm0, mm2
+ paddw mm1, mm2
+ movq mm2, mm0
+ paddw mm0, mm3
+ paddw mm1, [sum+8] ; h
+ psrlq mm2, 16
+ paddw mm2, mm3
+
+ movq mm3, [ecx+16] ; top left
+ movq mm4, [ecx+24] ; top right
+ psllw mm3, 3
+ psllw mm4, 3
+ psubw mm3, [sum+16]
+ psubw mm4, [sum+24]
+ MMX_ABS_TWO mm3, mm4, mm5, mm6
+ paddw mm2, mm3
+ paddw mm2, mm4 ; v
+
+ SUM_MM_X3 mm0, mm1, mm2, mm3, mm4, mm5, mm6, paddd
+ mov eax, [args+8]
+ movd ecx, mm2
+ movd edx, mm1
+ add ecx, 2
+ add edx, 2
+ shr ecx, 2
+ shr edx, 2
+ mov [eax+0], ecx ; i8x8_v satd
+ mov [eax+4], edx ; i8x8_h satd
+ movd ecx, mm0
+ add ecx, 2
+ shr ecx, 2
+ mov [eax+8], ecx ; i8x8_dc satd
+
+ add esp, 0x70
+ ret
+%undef args
+%undef spill
+%undef trans
+%undef sum
+
int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
+void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int *, int );
+void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int *, int );
+void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
+
#endif
#include "common/common.h"
#include "common/clip1.h"
#include "predict.h"
+#include "pixel.h"
extern void predict_16x16_v_mmx( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
****************************************************************************/
#define PL(y) \
- const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
+ int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT(have_tl) \
- const int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \
+ int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \
+ 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
- UNUSED const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
+ UNUSED int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
#define PT(x) \
- const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+ int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP(have_tl) \
- const int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \
+ int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \
+ 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
- UNUSED const int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) \
+ UNUSED int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) \
+ 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \
#define PTR(x) \
} else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
#define PREDICT_8x8_LOAD_TOPLEFT \
- const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+ int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
#define PREDICT_8x8_DC(v) \
int y; \
}
#endif
+#ifdef ARCH_X86
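+/* four scalar butterflies: (a,b) -> (a+b, b-a), etc.; t must be declared by the caller */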
+#define SUMSUB(a,b,c,d,e,f,g,h)\
+ t=a; a+=b; b-=t;\
+ t=c; c+=d; d-=t;\
+ t=e; e+=f; f-=t;\
+ t=g; g+=h; h-=t;
+
+void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t *src, int res[3], int i_neighbor )
+{
+ PREDICT_8x8_LOAD_TOP(1)
+ PREDICT_8x8_LOAD_LEFT(1)
+ int t;
+ int16_t edges[2][8];
+ SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);
+ SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);
+ SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);
+ edges[0][0] = l0;
+ edges[0][1] = l1;
+ edges[0][2] = l2;
+ edges[0][3] = l3;
+ edges[0][4] = l4;
+ edges[0][5] = l5;
+ edges[0][6] = l6;
+ edges[0][7] = l7;
+ SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);
+ SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);
+ SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);
+ edges[1][0] = t0;
+ edges[1][1] = t1;
+ edges[1][2] = t2;
+ edges[1][3] = t3;
+ edges[1][4] = t4;
+ edges[1][5] = t5;
+ edges[1][6] = t6;
+ edges[1][7] = t7;
+ x264_intra_sa8d_x3_8x8_core_mmxext( fenc, edges, res );
+}
+#endif
+
/****************************************************************************
* Exported functions:
****************************************************************************/
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
#endif
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
#endif
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
}
#endif
/* multiple parallel calls to sad. */
x264_pixel_cmp_x3_t sad_x3[7];
x264_pixel_cmp_x4_t sad_x4[7];
+
+ /* calculate satd of V, H, and DC modes.
+ * may be NULL, in which case just use pred+satd instead. */
+ void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
+ void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
+ void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
+ void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t *fdec, int res[3], int i_neighbors );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
a->i_satd_i8x8chroma = COST_MAX;
- for( i = 0; i < i_max; i++ )
+ if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
+ {
+ int satdu[4], satdv[4];
+ h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
+ h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
+ h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
+ h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
+ satdu[I_PRED_CHROMA_P] =
+ h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE, p_srcc[0], FENC_STRIDE );
+ satdv[I_PRED_CHROMA_P] =
+ h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE, p_srcc[1], FENC_STRIDE );
+
+ for( i=0; i<i_max; i++ )
+ {
+ int i_mode = predict_mode[i];
+ int i_satd = satdu[i_mode] + satdv[i_mode]
+ + a->i_lambda * bs_size_ue(i_mode);
+ COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+ }
+ }
+ else
{
- int i_satd;
- int i_mode;
-
- i_mode = predict_mode[i];
+ for( i=0; i<i_max; i++ )
+ {
+ int i_satd;
+ int i_mode = predict_mode[i];
- /* we do the prediction */
- h->predict_8x8c[i_mode]( p_dstc[0] );
- h->predict_8x8c[i_mode]( p_dstc[1] );
+ /* we do the prediction */
+ h->predict_8x8c[i_mode]( p_dstc[0] );
+ h->predict_8x8c[i_mode]( p_dstc[1] );
- /* we calculate the cost */
- i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
- p_srcc[0], FENC_STRIDE ) +
- h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
- p_srcc[1], FENC_STRIDE ) +
- a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
+ /* we calculate the cost */
+ i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dstc[0], FDEC_STRIDE,
+ p_srcc[0], FENC_STRIDE ) +
+ h->pixf.mbcmp[PIXEL_8x8]( p_dstc[1], FDEC_STRIDE,
+ p_srcc[1], FENC_STRIDE ) +
+ a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
- COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+ COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+ }
}
h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
int i, idx;
int i_max;
int predict_mode[9];
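+ /* the merged x3 functions compute satd, so they only apply when mbcmp is satd */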
+ int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
/*---------------- Try all mode and calculate their score ---------------*/
/* 16x16 prediction selection */
predict_16x16_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
- for( i = 0; i < i_max; i++ )
+
+ if( b_merged_satd && i_max == 4 )
{
- int i_satd;
- int i_mode = predict_mode[i];
- h->predict_16x16[i_mode]( p_dst );
+ h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+ h->predict_16x16[I_PRED_16x16_P]( p_dst );
+ a->i_satd_i16x16_dir[I_PRED_16x16_P] =
+ h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+ for( i=0; i<4; i++ )
+ {
+ int cost = a->i_satd_i16x16_dir[i] += a->i_lambda * bs_size_ue(i);
+ COPY2_IF_LT( a->i_satd_i16x16, cost, a->i_predict16x16, i );
+ }
+ }
+ else
+ {
+ for( i = 0; i < i_max; i++ )
+ {
+ int i_satd;
+ int i_mode = predict_mode[i];
+ h->predict_16x16[i_mode]( p_dst );
- i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
- a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
- COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
- a->i_satd_i16x16_dir[i_mode] = i_satd;
+ i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
+ a->i_lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
+ COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
+ a->i_satd_i16x16_dir[i_mode] = i_satd;
+ }
}
if( h->sh.i_type == SLICE_TYPE_B )
x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
+ b_merged_satd = h->pixf.intra_sa8d_x3_8x8 && h->pixf.mbcmp[0] == h->pixf.satd[0];
// FIXME some bias like in i4x4?
if( h->sh.i_type == SLICE_TYPE_B )
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
- for( i = 0; i < i_max; i++ )
+
+ if( b_merged_satd && i_max == 9 )
+ {
+ int satd[3];
+ h->pixf.intra_sa8d_x3_8x8( p_src_by, p_dst_by, satd, h->mb.i_neighbour8[idx] );
+ if( i_pred_mode < 3 )
+ satd[i_pred_mode] -= 3 * a->i_lambda;
+ for( i=2; i>=0; i-- )
+ {
+ int cost = a->i_satd_i8x8_dir[i][idx] = satd[i] + 4 * a->i_lambda;
+ COPY2_IF_LT( i_best, cost, a->i_predict8x8[idx], i );
+ }
+ i = 3;
+ }
+ else
+ i = 0;
+
+ for( ; i<i_max; i++ )
{
int i_satd;
int i_mode = predict_mode[i];
{
int i_cost;
int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
+ b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
if( a->b_mbrd )
i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
/* emulate missing topright samples */
*(uint32_t*) &p_dst_by[4 - FDEC_STRIDE] = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
- for( i = 0; i < i_max; i++ )
+ if( b_merged_satd && i_max >= 6 )
+ {
+ int satd[3];
+ h->pixf.intra_satd_x3_4x4( p_src_by, p_dst_by, satd );
+ if( i_pred_mode < 3 )
+ satd[i_pred_mode] -= 3 * a->i_lambda;
+ for( i=2; i>=0; i-- )
+ COPY2_IF_LT( i_best, satd[i] + 4 * a->i_lambda,
+ a->i_predict4x4[idx], i );
+ i = 3;
+ }
+ else
+ i = 0;
+
+ for( ; i<i_max; i++ )
{
int i_satd;
- int i_mode;
+ int i_mode = predict_mode[i];
- i_mode = predict_mode[i];
h->predict_4x4[i_mode]( p_dst_by );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
- p_src_by, FENC_STRIDE )
+ p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
uint8_t *pix = &pix_buf[8+FDEC_STRIDE - 1];
uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
int intra_penalty = 5 + 10 * b_bidir;
- i_cost_bak = i_bcost;
+ int satds[4], i_icost;
memcpy( pix-FDEC_STRIDE, src-i_stride, 9 );
for( i=0; i<8; i++ )
pix[i*FDEC_STRIDE] = src[i*i_stride];
pix++;
- for( i = I_PRED_CHROMA_DC; i <= I_PRED_CHROMA_P; i++ )
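+ /* intra cost: best of the four 8x8c predictions of the lowres luma.
+ * the merged satd_x3 covers V/H/DC; plane is always predicted explicitly. */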
+ if( h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
{
- int i_cost;
- h->predict_8x8c[i]( pix );
- i_cost = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ) + intra_penalty;
- i_bcost = X264_MIN( i_bcost, i_cost );
+ h->pixf.intra_satd_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
+ h->predict_8x8c[I_PRED_CHROMA_P]( pix );
+ satds[I_PRED_CHROMA_P] =
+ h->pixf.satd[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
}
- if( i_bcost != i_cost_bak )
+ else
+ {
+ for( i=0; i<4; i++ )
+ {
+ h->predict_8x8c[i]( pix );
+ satds[i] = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
+ }
+ }
+ i_icost = X264_MIN4( satds[0], satds[1], satds[2], satds[3] ) + intra_penalty;
+ if( i_icost < i_bcost )
{
+ i_bcost = i_icost;
if( !b_bidir
&& i_mb_x > 0 && i_mb_x < h->sps->i_mb_width - 1
&& i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1 )
x264_pixel_function_t pixel_c;
x264_pixel_function_t pixel_ref;
x264_pixel_function_t pixel_asm;
+ x264_predict_t predict_16x16[4+3];
+ x264_predict_t predict_8x8c[4+3];
+ x264_predict_t predict_4x4[9+3];
+ x264_predict8x8_t predict_8x8[9+3];
int ret = 0, ok, used_asm;
int i;
x264_pixel_init( 0, &pixel_c );
x264_pixel_init( cpu_ref, &pixel_ref );
x264_pixel_init( cpu_new, &pixel_asm );
+ x264_predict_16x16_init( 0, predict_16x16 );
+ x264_predict_8x8c_init( 0, predict_8x8c );
+ x264_predict_8x8_init( 0, predict_8x8 );
+ x264_predict_4x4_init( 0, predict_4x4 );
#define TEST_PIXEL( name ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
+
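+/* run each C prediction + C satd for modes 0-2 and compare against the merged asm x3 version */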
+#define TEST_INTRA_SATD( name, pred, satd, ... ) \
+ if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
+ { \
+ int res_c[3], res_asm[3]; \
+ used_asm = 1; \
+ memcpy( buf3, buf2, 1024 ); \
+ for( i=0; i<3; i++ ) \
+ { \
+ pred[i]( buf3+40, ##__VA_ARGS__ ); \
+ res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+ } \
+ pixel_asm.name( buf1+40, buf3+40, res_asm, ##__VA_ARGS__ ); \
+ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
+ res_c[0], res_c[1], res_c[2], \
+ res_asm[0], res_asm[1], res_asm[2] ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16] );
+ TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8] );
+ TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4] );
+ TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8],
+ MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT );
+ TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8],
+ MB_LEFT|MB_TOP|MB_TOPLEFT );
+ report( "intra satd_x3 :" );
+
return ret;
}