From: Loren Merritt Date: Tue, 20 Nov 2007 06:07:17 +0000 (+0000) Subject: avoid memory loads that span the border between two cachelines. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d4ebafa5d7db55e0d21c633c25d4835c5b94e3fd;p=libx264 avoid memory loads that span the border between two cachelines. on core2 this makes x264_pixel_sad an average of 2x faster. other intel cpus gain various amounts. amd are unaffected. overall speedup: 1-10%, depending on how much time is spent in fullpel motion estimation. git-svn-id: svn://svn.videolan.org/x264/trunk@696 df754926-b1dd-0310-bc7b-ec298dee348c --- diff --git a/common/amd64/amd64inc.asm b/common/amd64/amd64inc.asm index 78f8ad9e..b928652b 100644 --- a/common/amd64/amd64inc.asm +++ b/common/amd64/amd64inc.asm @@ -41,6 +41,15 @@ BITS 64 %1: %endmacro +%macro cextern 1 + %ifdef PREFIX + extern _%1 + %define %1 _%1 + %else + extern %1 + %endif +%endmacro + %macro pad 1 %undef %1 %ifdef PREFIX diff --git a/common/amd64/pixel-sse2.asm b/common/amd64/pixel-sse2.asm index 21a28976..21dd10b4 100644 --- a/common/amd64/pixel-sse2.asm +++ b/common/amd64/pixel-sse2.asm @@ -36,6 +36,7 @@ ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 mask_ff: times 16 db 0xff times 16 db 0 +sw_64: dq 64 SECTION .text @@ -51,25 +52,6 @@ SECTION .text HADDD %1, %2 %endmacro -%macro SAD_INC_4x16P_SSE2 0 - movdqu xmm1, [rdx] - movdqu xmm2, [rdx+rcx] - lea rdx, [rdx+2*rcx] - movdqu xmm3, [rdx] - movdqu xmm4, [rdx+rcx] - psadbw xmm1, [rdi] - psadbw xmm2, [rdi+rsi] - lea rdi, [rdi+2*rsi] - psadbw xmm3, [rdi] - psadbw xmm4, [rdi+rsi] - lea rdi, [rdi+2*rsi] - lea rdx, [rdx+2*rcx] - paddw xmm1, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm1 - paddw xmm0, xmm3 -%endmacro - %macro SAD_END_SSE2 0 movhlps xmm1, xmm0 paddw xmm0, xmm1 @@ -77,10 +59,11 @@ SECTION .text ret %endmacro +%macro SAD_W16 1 ;----------------------------------------------------------------------------- ; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_sse2 +cglobal x264_pixel_sad_16x16_%1 movdqu xmm0, [rdx] movdqu xmm1, [rdx+rcx] lea rdx, [rdx+2*rcx] @@ -147,11 +130,47 @@ cglobal x264_pixel_sad_16x16_sse2 ;----------------------------------------------------------------------------- ; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x8_sse2 - pxor xmm0, xmm0 - SAD_INC_4x16P_SSE2 - SAD_INC_4x16P_SSE2 +cglobal x264_pixel_sad_16x8_%1 + movdqu xmm0, [rdx] + movdqu xmm2, [rdx+rcx] + lea rdx, [rdx+2*rcx] + movdqu xmm3, [rdx] + movdqu xmm4, [rdx+rcx] + psadbw xmm0, [rdi] + psadbw xmm2, [rdi+rsi] + lea rdi, [rdi+2*rsi] + psadbw xmm3, [rdi] + psadbw xmm4, [rdi+rsi] + lea rdi, [rdi+2*rsi] + lea rdx, [rdx+2*rcx] + paddw xmm0, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm3 + movdqu xmm1, [rdx] + movdqu xmm2, [rdx+rcx] + lea rdx, [rdx+2*rcx] + movdqu xmm3, [rdx] + movdqu xmm4, [rdx+rcx] + psadbw xmm1, [rdi] + psadbw xmm2, [rdi+rsi] + lea rdi, [rdi+2*rsi] + psadbw xmm3, [rdi] + psadbw xmm4, [rdi+rsi] + lea rdi, [rdi+2*rsi] + lea rdx, [rdx+2*rcx] + paddw xmm1, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm1 + paddw xmm0, xmm3 SAD_END_SSE2 +%endmacro + +SAD_W16 sse2 +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_W16 sse3 +%undef movdqu +%endif ; sad x3 / x4 @@ -268,8 +287,8 @@ cglobal x264_pixel_sad_16x8_sse2 ; void 
x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, int i_stride, int scores[3] ) ;----------------------------------------------------------------------------- -%macro SAD_X 3 -cglobal x264_pixel_sad_x%1_%2x%3_sse2 +%macro SAD_X 4 +cglobal x264_pixel_sad_x%1_%2x%3_%4 SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 @@ -277,10 +296,224 @@ cglobal x264_pixel_sad_x%1_%2x%3_sse2 SAD_X%1_END %endmacro -SAD_X 3, 16, 16 -SAD_X 3, 16, 8 -SAD_X 4, 16, 16 -SAD_X 4, 16, 8 +SAD_X 3, 16, 16, sse2 +SAD_X 3, 16, 8, sse2 +SAD_X 4, 16, 16, sse2 +SAD_X 4, 16, 8, sse2 + +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_X 3, 16, 16, sse3 +SAD_X 3, 16, 8, sse3 +SAD_X 4, 16, 16, sse3 +SAD_X 4, 16, 8, sse3 +%undef movdqu +%endif + + +; Core2 (Conroe) can load unaligned data just as quickly as aligned data... +; unless the unaligned data spans the border between 2 cachelines, in which +; case it's really slow. The exact numbers may differ, but all Intel cpus +; have a large penalty for cacheline splits. +; (8-byte alignment exactly half way between two cachelines is ok though.) +; LDDQU was supposed to fix this, but it only works on Pentium 4. +; So in the split case we load aligned data and explicitly perform the +; alignment between registers. Like on archs that have only aligned loads, +; except complicated by the fact that PALIGNR takes only an immediate, not +; a variable alignment. +; It is also possible to hoist the realignment to the macroblock level (keep +; 2 copies of the reference frame, offset by 32 bytes), but the extra memory +; needed for that method makes it often slower. + +; sad 16x16 costs on Core2: +; good offsets: 49 cycles (50/64 of all mvs) +; cacheline split: 234 cycles (14/64 of all mvs. ammortized: +40 cycles) +; page split: 3600 cycles (14/4096 of all mvs. ammortized: +11.5 cycles) +; cache or page split with palignr: 57 cycles (ammortized: +2 cycles) + +; computed jump assumes this loop is exactly 64 bytes +%macro SAD16_CACHELINE_LOOP 1 ; alignment +ALIGN 16 +sad_w16_align%1: + movdqa xmm1, [rdx+16] + movdqa xmm2, [rdx+rcx+16] + palignr xmm1, [rdx], %1 + palignr xmm2, [rdx+rcx], %1 + psadbw xmm1, [rdi] + psadbw xmm2, [rdi+rsi] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea rdx, [rdx+2*rcx] + lea rdi, [rdi+2*rsi] + dec eax + jg sad_w16_align%1 + ret +%endmacro + +%macro SAD16_CACHELINE_FUNC 1 ; height +cglobal x264_pixel_sad_16x%1_cache64_ssse3 + mov eax, parm3d + and eax, 0x37 + cmp eax, 0x30 + jle x264_pixel_sad_16x%1_sse2 + mov eax, parm3d + and eax, 15 + shl eax, 6 +%ifdef __PIC__ + lea r10, [sad_w16_align1 - 64 GLOBAL] + add r10, rax +%else + lea r10, [sad_w16_align1 - 64 + rax] +%endif + and parm3q, ~15 + mov eax, %1/2 + pxor xmm0, xmm0 + call r10 + SAD_END_SSE2 +%endmacro + +%macro SAD8_CACHELINE_FUNC 1 ; height +cglobal x264_pixel_sad_8x%1_cache64_mmxext + mov eax, parm3d + and eax, 0x3f + cmp eax, 0x38 + jle x264_pixel_sad_8x%1_mmxext + and eax, 7 + shl eax, 3 + movd mm6, [sw_64 GLOBAL] + movd mm7, eax + psubw mm6, mm7 + and parm3q, ~7 + mov eax, %1/2 + pxor mm0, mm0 +.loop: + movq mm1, [parm3q+8] + movq mm2, [parm3q+parm4q+8] + movq mm3, [parm3q] + movq mm4, [parm3q+parm4q] + psllq mm1, mm6 + psllq mm2, mm6 + psrlq mm3, mm7 + psrlq mm4, mm7 + por mm1, mm3 + por mm2, mm4 + psadbw mm1, [parm1q] + psadbw mm2, [parm1q+parm2q] + paddw mm0, mm1 + paddw mm0, mm2 + lea parm3q, [parm3q+2*parm4q] + lea parm1q, [parm1q+2*parm2q] + dec eax + jg .loop + movd eax, mm0 + ret +%endmacro + + +; sad_x3/x4_cache64: check each mv. 
+; if they're all within a cacheline, use normal sad_x3/x4. +; otherwise, send them individually to sad_cache64. +%macro CHECK_SPLIT 2 ; pix, width + mov eax, %1 + and eax, 0x37|%2 + cmp eax, 0x30|%2 + jg .split +%endmacro + +%macro SADX3_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver +cglobal x264_pixel_sad_x3_%1x%2_cache64_%4 + CHECK_SPLIT parm2d, %1 + CHECK_SPLIT parm3d, %1 + CHECK_SPLIT parm4d, %1 + jmp x264_pixel_sad_x3_%1x%2_%3 +.split: + push parm4q + push parm3q + mov parm3q, parm2q + mov parm2q, FENC_STRIDE + mov parm4q, parm5q + mov parm5q, parm1q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [parm6q], eax + pop parm3q + mov parm1q, parm5q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [parm6q+4], eax + pop parm3q + mov parm1q, parm5q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [parm6q+8], eax + ret +%endmacro + +%macro SADX4_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver +cglobal x264_pixel_sad_x4_%1x%2_cache64_%4 + CHECK_SPLIT parm2d, %1 + CHECK_SPLIT parm3d, %1 + CHECK_SPLIT parm4d, %1 + CHECK_SPLIT parm5d, %1 + jmp x264_pixel_sad_x4_%1x%2_%3 +.split: + mov r11, parm7q + push parm5q + push parm4q + push parm3q + mov parm3q, parm2q + mov parm2q, FENC_STRIDE + mov parm4q, parm6q + mov parm5q, parm1q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [r11], eax + pop parm3q + mov parm1q, parm5q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [r11+4], eax + pop parm3q + mov parm1q, parm5q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [r11+8], eax + pop parm3q + mov parm1q, parm5q + call x264_pixel_sad_%1x%2_cache64_%4 + mov [r11+12], eax + ret +%endmacro + +%macro SADX34_CACHELINE_FUNC 4 + SADX3_CACHELINE_FUNC %1, %2, %3, %4 + SADX4_CACHELINE_FUNC %1, %2, %3, %4 +%endmacro + +cextern x264_pixel_sad_8x16_mmxext +cextern x264_pixel_sad_8x8_mmxext +cextern x264_pixel_sad_8x4_mmxext +cextern x264_pixel_sad_x3_8x16_mmxext +cextern x264_pixel_sad_x3_8x8_mmxext +cextern x264_pixel_sad_x4_8x16_mmxext +cextern x264_pixel_sad_x4_8x8_mmxext + +; instantiate the aligned sads + +SAD8_CACHELINE_FUNC 4 +SAD8_CACHELINE_FUNC 8 +SAD8_CACHELINE_FUNC 16 +SADX34_CACHELINE_FUNC 8, 16, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 8, mmxext, mmxext + +%ifdef HAVE_SSE3 + +SAD16_CACHELINE_FUNC 8 +SAD16_CACHELINE_FUNC 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP i +%assign i i+1 +%endrep + +SADX34_CACHELINE_FUNC 16, 16, sse2, ssse3 +SADX34_CACHELINE_FUNC 16, 8, sse2, ssse3 + +%endif ; HAVE_SSE3 ; ssd diff --git a/common/frame.c b/common/frame.c index 8f4577cc..570441bf 100644 --- a/common/frame.c +++ b/common/frame.c @@ -43,6 +43,12 @@ x264_frame_t *x264_frame_new( x264_t *h ) if( h->param.b_interlaced ) i_lines = ( i_lines + 31 ) & -32; + if( h->param.cpu&X264_CPU_CACHELINE_SPLIT ) + { + int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64; + i_stride = (i_stride + align-1) & -align; + } + frame->i_plane = 3; for( i = 0; i < 3; i++ ) { diff --git a/common/i386/i386inc.asm b/common/i386/i386inc.asm index dedfb1f7..e56f5940 100644 --- a/common/i386/i386inc.asm +++ b/common/i386/i386inc.asm @@ -38,6 +38,15 @@ BITS 32 %1: %endmacro +%macro cextern 1 + %ifdef PREFIX + extern _%1 + %define %1 _%1 + %else + extern %1 + %endif +%endmacro + ; Name of the .rodata section. On OS X we cannot use .rodata because NASM ; is unable to compute address offsets outside of .text so we use the .text ; section instead until NASM is fixed. 
diff --git a/common/i386/pixel-sse2.asm b/common/i386/pixel-sse2.asm index 84509b61..9a849d51 100644 --- a/common/i386/pixel-sse2.asm +++ b/common/i386/pixel-sse2.asm @@ -50,28 +50,8 @@ SECTION .text paddd %1, %2 %endmacro -%macro SAD_INC_4x16P_SSE2 0 - movdqu xmm1, [ecx] - movdqu xmm2, [ecx+edx] - lea ecx, [ecx+2*edx] - movdqu xmm3, [ecx] - movdqu xmm4, [ecx+edx] - psadbw xmm1, [eax] - psadbw xmm2, [eax+ebx] - lea eax, [eax+2*ebx] - psadbw xmm3, [eax] - psadbw xmm4, [eax+ebx] - lea eax, [eax+2*ebx] - lea ecx, [ecx+2*edx] - paddw xmm1, xmm2 - paddw xmm3, xmm4 - paddw xmm0, xmm1 - paddw xmm0, xmm3 -%endmacro - %macro SAD_START_SSE2 0 push ebx - mov eax, [esp+ 8] ; pix1 mov ebx, [esp+12] ; stride1 mov ecx, [esp+16] ; pix2 @@ -79,19 +59,18 @@ SECTION .text %endmacro %macro SAD_END_SSE2 0 - movdqa xmm1, xmm0 - psrldq xmm0, 8 + movhlps xmm1, xmm0 paddw xmm0, xmm1 movd eax, xmm0 - pop ebx ret %endmacro +%macro SAD_W16 1 ;----------------------------------------------------------------------------- ; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x16_sse2 +cglobal x264_pixel_sad_16x16_%1 SAD_START_SSE2 movdqu xmm0, [ecx] movdqu xmm1, [ecx+edx] @@ -159,14 +138,52 @@ cglobal x264_pixel_sad_16x16_sse2 ;----------------------------------------------------------------------------- ; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int ) ;----------------------------------------------------------------------------- -cglobal x264_pixel_sad_16x8_sse2 +cglobal x264_pixel_sad_16x8_%1 SAD_START_SSE2 - pxor xmm0, xmm0 - SAD_INC_4x16P_SSE2 - SAD_INC_4x16P_SSE2 + movdqu xmm0, [ecx] + movdqu xmm2, [ecx+edx] + lea ecx, [ecx+2*edx] + movdqu xmm3, [ecx] + movdqu xmm4, [ecx+edx] + psadbw xmm0, [eax] + psadbw xmm2, [eax+ebx] + lea eax, [eax+2*ebx] + psadbw xmm3, [eax] + psadbw xmm4, [eax+ebx] + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + paddw xmm0, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm3 + movdqu xmm1, [ecx] + movdqu xmm2, [ecx+edx] + lea ecx, [ecx+2*edx] + movdqu xmm3, [ecx] + movdqu xmm4, [ecx+edx] + psadbw xmm1, [eax] + psadbw xmm2, [eax+ebx] + lea eax, [eax+2*ebx] + psadbw xmm3, [eax] + psadbw xmm4, [eax+ebx] + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + paddw xmm1, xmm2 + paddw xmm3, xmm4 + paddw xmm0, xmm1 + paddw xmm0, xmm3 SAD_END_SSE2 +%endmacro + +SAD_W16 sse2 +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_W16 sse3 +%undef movdqu +%endif +; sad x3 / x4 + %macro SAD_X3_START_1x16P 0 push edi push esi @@ -301,8 +318,8 @@ cglobal x264_pixel_sad_16x8_sse2 ; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, ; uint8_t *pix2, int i_stride, int scores[3] ) ;----------------------------------------------------------------------------- -%macro SAD_X 3 -cglobal x264_pixel_sad_x%1_%2x%3_sse2 +%macro SAD_X 4 +cglobal x264_pixel_sad_x%1_%2x%3_%4 SAD_X%1_2x%2P 1 %rep %3/2-1 SAD_X%1_2x%2P 0 @@ -310,10 +327,323 @@ cglobal x264_pixel_sad_x%1_%2x%3_sse2 SAD_X%1_END %endmacro -SAD_X 3, 16, 16 -SAD_X 3, 16, 8 -SAD_X 4, 16, 16 -SAD_X 4, 16, 8 +SAD_X 3, 16, 16, sse2 +SAD_X 3, 16, 8, sse2 +SAD_X 4, 16, 16, sse2 +SAD_X 4, 16, 8, sse2 + +%ifdef HAVE_SSE3 +%define movdqu lddqu +SAD_X 3, 16, 16, sse3 +SAD_X 3, 16, 8, sse3 +SAD_X 4, 16, 16, sse3 +SAD_X 4, 16, 8, sse3 +%undef movdqu +%endif + + +; Core2 (Conroe) can load unaligned data just as quickly as aligned data... 
+; unless the unaligned data spans the border between 2 cachelines, in which +; case it's really slow. The exact numbers may differ, but all Intel cpus +; have a large penalty for cacheline splits. +; (8-byte alignment exactly half way between two cachelines is ok though.) +; LDDQU was supposed to fix this, but it only works on Pentium 4. +; So in the split case we load aligned data and explicitly perform the +; alignment between registers. Like on archs that have only aligned loads, +; except complicated by the fact that PALIGNR takes only an immediate, not +; a variable alignment. + +; computed jump assumes this loop is exactly 80 bytes +%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment +ALIGN 16 +sad_w16_align%1_sse2: + movdqa xmm1, [ecx+16] + movdqa xmm2, [ecx+edx+16] + movdqa xmm3, [ecx] + movdqa xmm4, [ecx+edx] + pslldq xmm1, 16-%1 + pslldq xmm2, 16-%1 + psrldq xmm3, %1 + psrldq xmm4, %1 + por xmm1, xmm3 + por xmm2, xmm4 + psadbw xmm1, [eax] + psadbw xmm2, [eax+ebx] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + dec esi + jg sad_w16_align%1_sse2 + ret +%endmacro + +; computed jump assumes this loop is exactly 64 bytes +%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment +ALIGN 16 +sad_w16_align%1_ssse3: + movdqa xmm1, [ecx+16] + movdqa xmm2, [ecx+edx+16] + palignr xmm1, [ecx], %1 + palignr xmm2, [ecx+edx], %1 + psadbw xmm1, [eax] + psadbw xmm2, [eax+ebx] + paddw xmm0, xmm1 + paddw xmm0, xmm2 + lea eax, [eax+2*ebx] + lea ecx, [ecx+2*edx] + dec esi + jg sad_w16_align%1_ssse3 + ret +%endmacro + +%macro SAD16_CACHELINE_FUNC 2 ; cpu, height +cglobal x264_pixel_sad_16x%2_cache64_%1 + mov eax, [esp+12] + and eax, 0x37 + cmp eax, 0x30 + jle x264_pixel_sad_16x%2_sse2 + mov eax, [esp+12] + push ebx + push edi + push esi + and eax, 15 +%ifidn %1, ssse3 + shl eax, 6 +%else + lea eax, [eax*5] + shl eax, 4 +%endif + picgetgot ebx + lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GOT_ebx] + mov eax, [esp+16] + mov ebx, [esp+20] + mov ecx, [esp+24] + mov edx, [esp+28] + and ecx, ~15 + mov esi, %2/2 + pxor xmm0, xmm0 + call edi + pop esi + pop edi + SAD_END_SSE2 +%endmacro + +%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline + mov eax, [esp+12] + and eax, 0x17|%2|(%4>>1) + cmp eax, 0x10|%2|(%4>>1) + jle x264_pixel_sad_%1x%2_mmxext + push ebx + push esi + and eax, 7 + shl eax, 3 + mov ecx, 64 + sub ecx, eax + movd mm7, eax + movd mm6, ecx + mov eax, [esp+12] + mov ebx, [esp+16] + mov ecx, [esp+20] + mov edx, [esp+24] + and ecx, ~7 + mov esi, %3 + pxor mm0, mm0 +%endmacro + +%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal x264_pixel_sad_16x%1_cache%2_mmxext + SAD_CACHELINE_START_MMX2 16, %1, %1, %2 +.loop: + movq mm1, [ecx] + movq mm2, [ecx+8] + movq mm3, [ecx+16] + movq mm4, mm2 + psrlq mm1, mm7 + psllq mm2, mm6 + psllq mm3, mm6 + psrlq mm4, mm7 + por mm1, mm2 + por mm3, mm4 + psadbw mm1, [eax] + psadbw mm3, [eax+8] + paddw mm0, mm1 + paddw mm0, mm3 + add ecx, edx + add eax, ebx + dec esi + jg .loop + pop esi + pop ebx + movd eax, mm0 + ret +%endmacro + +%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline +cglobal x264_pixel_sad_8x%1_cache%2_mmxext + SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2 +.loop: + movq mm1, [ecx+8] + movq mm2, [ecx+edx+8] + movq mm3, [ecx] + movq mm4, [ecx+edx] + psllq mm1, mm6 + psllq mm2, mm6 + psrlq mm3, mm7 + psrlq mm4, mm7 + por mm1, mm3 + por mm2, mm4 + psadbw mm1, [eax] + psadbw mm2, [eax+ebx] + paddw mm0, mm1 + paddw mm0, mm2 + lea ecx, [ecx+2*edx] + lea eax, [eax+2*ebx] + 
dec esi + jg .loop + pop esi + pop ebx + movd eax, mm0 + ret +%endmacro + + +; sad_x3/x4_cache64: check each mv. +; if they're all within a cacheline, use normal sad_x3/x4. +; otherwise, send them individually to sad_cache64. +%macro CHECK_SPLIT 3 ; pix, width, cacheline + mov eax, %1 + and eax, 0x17|%2|(%3>>1) + cmp eax, 0x10|%2|(%3>>1) + jg .split +%endmacro + +%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver +cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5 + CHECK_SPLIT [esp+8], %1, %3 + CHECK_SPLIT [esp+12], %1, %3 + CHECK_SPLIT [esp+16], %1, %3 + jmp x264_pixel_sad_x3_%1x%2_%4 +.split: + push edi + mov edi, [esp+28] + push dword [esp+24] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [edi+8], eax + add esp, 16 + pop edi + ret +%endmacro + +%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver +cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5 + CHECK_SPLIT [esp+8], %1, %3 + CHECK_SPLIT [esp+12], %1, %3 + CHECK_SPLIT [esp+16], %1, %3 + CHECK_SPLIT [esp+20], %1, %3 + jmp x264_pixel_sad_x4_%1x%2_%4 +.split: + push edi + mov edi, [esp+32] + push dword [esp+28] + push dword [esp+16] + push dword 16 + push dword [esp+20] + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+32] + mov [edi], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+36] + mov [edi+4], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov ecx, [esp+40] + mov [edi+8], eax + mov [esp+8], ecx + call x264_pixel_sad_%1x%2_cache%3_%5 + mov [edi+12], eax + add esp, 16 + pop edi + ret +%endmacro + +%macro SADX34_CACHELINE_FUNC 5 + SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5 + SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5 +%endmacro + +cextern x264_pixel_sad_16x16_mmxext +cextern x264_pixel_sad_16x8_mmxext +cextern x264_pixel_sad_8x16_mmxext +cextern x264_pixel_sad_8x8_mmxext +cextern x264_pixel_sad_8x4_mmxext +cextern x264_pixel_sad_x3_16x16_mmxext +cextern x264_pixel_sad_x3_16x8_mmxext +cextern x264_pixel_sad_x3_8x16_mmxext +cextern x264_pixel_sad_x3_8x8_mmxext +cextern x264_pixel_sad_x4_16x16_mmxext +cextern x264_pixel_sad_x4_16x8_mmxext +cextern x264_pixel_sad_x4_8x16_mmxext +cextern x264_pixel_sad_x4_8x8_mmxext + +; instantiate the aligned sads + +SAD16_CACHELINE_FUNC sse2, 8 +SAD16_CACHELINE_FUNC sse2, 16 +%assign i 1 +%rep 15 +SAD16_CACHELINE_LOOP_SSE2 i +%assign i i+1 +%endrep + +SAD16_CACHELINE_FUNC_MMX2 16, 32 +SAD8_CACHELINE_FUNC_MMX2 4, 32 +SAD8_CACHELINE_FUNC_MMX2 8, 32 +SAD8_CACHELINE_FUNC_MMX2 16, 32 +SAD16_CACHELINE_FUNC_MMX2 8, 64 +SAD16_CACHELINE_FUNC_MMX2 16, 64 +SAD8_CACHELINE_FUNC_MMX2 4, 64 +SAD8_CACHELINE_FUNC_MMX2 8, 64 +SAD8_CACHELINE_FUNC_MMX2 16, 64 +SAD16_CACHELINE_FUNC_MMX2 8, 32 + +SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2 +SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2 + +%ifdef HAVE_SSE3 + +SAD16_CACHELINE_FUNC ssse3, 8 +SAD16_CACHELINE_FUNC ssse3, 16 +%assign i 1 +%rep 15 
+SAD16_CACHELINE_LOOP_SSSE3 i +%assign i i+1 +%endrep + +SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3 +SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3 + +%endif ; HAVE_SSE3 %macro SSD_INC_2x16P_SSE2 0 diff --git a/common/i386/pixel.h b/common/i386/pixel.h index 735120c2..f0ff62bd 100644 --- a/common/i386/pixel.h +++ b/common/i386/pixel.h @@ -24,76 +24,48 @@ #ifndef _I386_PIXEL_H #define _I386_PIXEL_H 1 -int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int ); - -void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); - -int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_16x8_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x16_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x8_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_8x4_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_4x8_mmx( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_4x4_mmx( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int ); - -void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, 
int * ); -void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); -void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ); - -int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_satd_16x16_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_16x8_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int ); -int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int ); - -int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int ); -int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int ); +#define DECL_PIXELS( ret, name, suffix, args ) \ + ret x264_pixel_##name##_16x16_##suffix args;\ + ret x264_pixel_##name##_16x8_##suffix args;\ + ret x264_pixel_##name##_8x16_##suffix args;\ + ret x264_pixel_##name##_8x8_##suffix args;\ + ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x8_##suffix args;\ + ret x264_pixel_##name##_4x4_##suffix args;\ + +#define DECL_X1( name, suffix ) \ + DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) ) + +#define DECL_X4( name, suffix ) \ + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) ) + +DECL_X1( sad, mmxext ) +DECL_X1( sad, sse2 ) +DECL_X1( sad, sse3 ) +DECL_X4( sad, mmxext ) +DECL_X4( sad, sse2 ) +DECL_X4( sad, sse3 ) +DECL_X1( ssd, mmx ) +DECL_X1( ssd, sse2 ) +DECL_X1( satd, mmxext ) +DECL_X1( satd, sse2 ) +DECL_X1( satd, ssse3 ) +DECL_X1( sa8d, mmxext ) +DECL_X1( sa8d, sse2 ) +DECL_X1( sa8d, ssse3 ) +DECL_X1( sad, cache32_mmxext ); +DECL_X1( sad, cache64_mmxext ); +DECL_X1( sad, cache64_sse2 ); +DECL_X1( sad, cache64_ssse3 ); +DECL_X4( sad, cache32_mmxext ); +DECL_X4( sad, cache64_mmxext ); +DECL_X4( sad, cache64_sse2 ); +DECL_X4( sad, cache64_ssse3 ); + +#undef DECL_PIXELS +#undef DECL_X1 +#undef DECL_X4 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); diff --git a/common/pixel.c b/common/pixel.c index 9f42eabe..c4e79d04 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -495,6 +495,33 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext; + + if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + if( cpu&X264_CPU_CACHELINE_32 ) + { + INIT5( sad, _cache32_mmxext ); + INIT4( sad_x3, _cache32_mmxext ); + INIT4( sad_x4, _cache32_mmxext ); + } + else + { + INIT5( sad, _cache64_mmxext ); + INIT4( sad_x3, _cache64_mmxext ); + INIT4( sad_x4, 
_cache64_mmxext ); + } + } +#else + if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext; + pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext; + pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmxext; + pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext; + pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmxext; + pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext; + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmxext; + } #endif pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext; pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext; @@ -508,6 +535,15 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); INIT5( satd, _sse2 ); + +#ifdef ARCH_X86 + if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + INIT2( sad, _cache64_sse2 ); + INIT2( sad_x3, _cache64_sse2 ); + INIT2( sad_x4, _cache64_sse2 ); + } +#endif } // these are faster on both Intel and AMD if( cpu&X264_CPU_SSE2 ) @@ -524,6 +560,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } #ifdef HAVE_SSE3 + if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) ) + { + INIT2( sad, _sse3 ); + INIT2( sad_x3, _sse3 ); + INIT2( sad_x4, _sse3 ); + } + if( cpu&X264_CPU_SSSE3 ) { INIT5( satd, _ssse3 ); @@ -531,6 +574,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3; #endif + if( cpu&X264_CPU_CACHELINE_SPLIT ) + { + INIT2( sad, _cache64_ssse3 ); + INIT2( sad_x3, _cache64_ssse3 ); + INIT2( sad_x4, _cache64_ssse3 ); + } } #endif //HAVE_SSE3 #endif //HAVE_MMX diff --git a/tools/checkasm.c b/tools/checkasm.c index beddffc2..d33cfd3c 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -54,13 +54,17 @@ static int check_pixel( int cpu_ref, int cpu_new ) int res_c, res_asm; \ if( pixel_asm.name[i] != pixel_ref.name[i] ) \ { \ - used_asm = 1; \ - res_c = pixel_c.name[i]( buf1, 32, buf2, 16 ); \ - res_asm = pixel_asm.name[i]( buf1, 32, buf2, 16 ); \ - if( res_c != res_asm ) \ + for( j=0; j<64; j++ ) \ { \ - ok = 0; \ - fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ + used_asm = 1; \ + res_c = pixel_c.name[i]( buf1, 32, buf2+j, 16 ); \ + res_asm = pixel_asm.name[i]( buf1, 32, buf2+j, 16 ); \ + if( res_c != res_asm ) \ + { \ + ok = 0; \ + fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \ + break; \ + } \ } \ } \ } \ @@ -77,23 +81,27 @@ static int check_pixel( int cpu_ref, int cpu_new ) int res_c[4]={0}, res_asm[4]={0}; \ if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \ { \ - used_asm = 1; \ - res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 32 ); \ - res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 32 ); \ - res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 32 ); \ - if(N==4) \ + for( j=0; j<64; j++) \ { \ - res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 32 ); \ - pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 32, res_asm ); \ - } \ - else \ - pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 32, res_asm ); \ - if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ - { \ - ok = 0; \ - fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ - i, res_c[0], res_c[1], res_c[2], res_c[3], \ - res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ + uint8_t *pix2 = buf2+j; \ + used_asm = 1; \ + res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 32 ); \ + res_c[1] = pixel_c.sad[i]( 
buf1, 16, pix2+30, 32 ); \ + res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 32 ); \ + if(N==4) \ + { \ + res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+99, 32 ); \ + pixel_asm.sad_x4[i]( buf1, pix2, pix2+30, pix2+1, pix2+99, 32, res_asm ); \ + } \ + else \ + pixel_asm.sad_x3[i]( buf1, pix2, pix2+30, pix2+1, 32, res_asm ); \ + if( memcmp(res_c, res_asm, sizeof(res_c)) ) \ + { \ + ok = 0; \ + fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \ + i, res_c[0], res_c[1], res_c[2], res_c[3], \ + res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \ + } \ } \ } \ } \ @@ -714,6 +722,14 @@ int check_all( int cpu_ref, int cpu_new ) + check_quant( cpu_ref, cpu_new ); } +int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name ) +{ + *cpu_ref = *cpu_new; + *cpu_new |= flags; + fprintf( stderr, "x264: %s\n", name ); + return check_all( *cpu_ref, *cpu_new ); +} + int main(int argc, char *argv[]) { int ret = 0; @@ -738,24 +754,26 @@ int main(int argc, char *argv[]) } #ifdef HAVE_MMX - fprintf( stderr, "x264: MMXEXT against C\n" ); - cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT; - ret = check_all( 0, cpu1 ); - + if( x264_cpu_detect() & X264_CPU_MMXEXT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" ); + cpu1 &= ~X264_CPU_CACHELINE_64; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" ); + } if( x264_cpu_detect() & X264_CPU_SSE2 ) { - fprintf( stderr, "\nx264: SSE2 against C\n" ); - cpu0 = cpu1; - cpu1 |= X264_CPU_SSE | X264_CPU_SSE2; - ret |= check_all( cpu0, cpu1 ); - - if( x264_cpu_detect() & X264_CPU_SSSE3 ) - { - fprintf( stderr, "\nx264: SSSE3 against C\n" ); - cpu0 = cpu1; - cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3; - ret |= check_all( cpu0, cpu1 ); - } + cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" ); + } + if( x264_cpu_detect() & X264_CPU_SSE3 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" ); + if( x264_cpu_detect() & X264_CPU_SSSE3 ) + { + cpu1 &= ~X264_CPU_CACHELINE_SPLIT; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); } #elif ARCH_PPC if( x264_cpu_detect() & X264_CPU_ALTIVEC )
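
; --- editorial aside, not part of the patch ---
; The CHECK_SPLIT macros and the cache64 entry points above decide between the
; fast path and the register-realignment path with a single mask-and-compare on
; the reference pointer. A minimal C sketch of that test for 16-byte loads and
; 64-byte cachelines follows; the helper name is hypothetical and only mirrors
; the asm "and eax, 0x37 / cmp eax, 0x30" sequence.

#include <stdint.h>

/* Hypothetical helper: nonzero if a 16-byte load at p spans two 64-byte
 * cachelines.  Offsets 48 and 56 within the line are treated as non-split,
 * matching the comment above that an 8-byte-aligned access exactly halfway
 * between two cachelines carries no penalty. */
static int load16_splits_cacheline64( const void *p )
{
    return ((uintptr_t)p & 0x37) > 0x30;
}

; The cache64 SAD functions apply this test to the reference pointer and jump
; to the plain sse2/mmxext version when it is false (the jle), so the slower
; realignment loop only runs for the roughly 14/64 of motion vectors that the
; cost table above says actually split a cacheline.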