%1:
%endmacro
+%macro cextern 1
+ %ifdef PREFIX
+ extern _%1
+ %define %1 _%1
+ %else
+ extern %1
+ %endif
+%endmacro
+
%macro pad 1
%undef %1
%ifdef PREFIX
ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff: times 16 db 0xff
times 16 db 0
+sw_64: dq 64
SECTION .text
HADDD %1, %2
%endmacro
-%macro SAD_INC_4x16P_SSE2 0
- movdqu xmm1, [rdx]
- movdqu xmm2, [rdx+rcx]
- lea rdx, [rdx+2*rcx]
- movdqu xmm3, [rdx]
- movdqu xmm4, [rdx+rcx]
- psadbw xmm1, [rdi]
- psadbw xmm2, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- psadbw xmm3, [rdi]
- psadbw xmm4, [rdi+rsi]
- lea rdi, [rdi+2*rsi]
- lea rdx, [rdx+2*rcx]
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm1
- paddw xmm0, xmm3
-%endmacro
-
%macro SAD_END_SSE2 0
movhlps xmm1, xmm0
paddw xmm0, xmm1
ret
%endmacro
+%macro SAD_W16 1
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_sse2
+cglobal x264_pixel_sad_16x16_%1
movdqu xmm0, [rdx]
movdqu xmm1, [rdx+rcx]
lea rdx, [rdx+2*rcx]
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_sse2
- pxor xmm0, xmm0
- SAD_INC_4x16P_SSE2
- SAD_INC_4x16P_SSE2
+cglobal x264_pixel_sad_16x8_%1
+ movdqu xmm0, [rdx]
+ movdqu xmm2, [rdx+rcx]
+ lea rdx, [rdx+2*rcx]
+ movdqu xmm3, [rdx]
+ movdqu xmm4, [rdx+rcx]
+ psadbw xmm0, [rdi]
+ psadbw xmm2, [rdi+rsi]
+ lea rdi, [rdi+2*rsi]
+ psadbw xmm3, [rdi]
+ psadbw xmm4, [rdi+rsi]
+ lea rdi, [rdi+2*rsi]
+ lea rdx, [rdx+2*rcx]
+ paddw xmm0, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm3
+ movdqu xmm1, [rdx]
+ movdqu xmm2, [rdx+rcx]
+ lea rdx, [rdx+2*rcx]
+ movdqu xmm3, [rdx]
+ movdqu xmm4, [rdx+rcx]
+ psadbw xmm1, [rdi]
+ psadbw xmm2, [rdi+rsi]
+ lea rdi, [rdi+2*rsi]
+ psadbw xmm3, [rdi]
+ psadbw xmm4, [rdi+rsi]
+ lea rdi, [rdi+2*rsi]
+ lea rdx, [rdx+2*rcx]
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm1
+ paddw xmm0, xmm3
SAD_END_SSE2
+%endmacro
+
+SAD_W16 sse2
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_W16 sse3
+%undef movdqu
+%endif
; sad x3 / x4
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_sse2
+%macro SAD_X 4
+cglobal x264_pixel_sad_x%1_%2x%3_%4
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
SAD_X%1_END
%endmacro
-SAD_X 3, 16, 16
-SAD_X 3, 16, 8
-SAD_X 4, 16, 16
-SAD_X 4, 16, 8
+SAD_X 3, 16, 16, sse2
+SAD_X 3, 16, 8, sse2
+SAD_X 4, 16, 16, sse2
+SAD_X 4, 16, 8, sse2
+
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_X 3, 16, 16, sse3
+SAD_X 3, 16, 8, sse3
+SAD_X 4, 16, 16, sse3
+SAD_X 4, 16, 8, sse3
+%undef movdqu
+%endif
+
+
+; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
+; unless the unaligned data spans the border between 2 cachelines, in which
+; case it's really slow. The exact numbers may differ, but all Intel cpus
+; have a large penalty for cacheline splits.
+; (8-byte alignment exactly halfway between two cachelines is ok though.)
+; LDDQU was supposed to fix this, but it only works on Pentium 4.
+; So in the split case we load aligned data and explicitly perform the
+; alignment between registers. Like on archs that have only aligned loads,
+; except complicated by the fact that PALIGNR takes only an immediate, not
+; a variable alignment.
+; It is also possible to hoist the realignment to the macroblock level (keep
+; 2 copies of the reference frame, offset by 32 bytes), but the extra memory
+; needed for that method often makes it slower.
+
+; sad 16x16 costs on Core2:
+; good offsets: 49 cycles (50/64 of all mvs)
+; cacheline split: 234 cycles (14/64 of all mvs. amortized: +40 cycles)
+; page split: 3600 cycles (14/4096 of all mvs. amortized: +11.5 cycles)
+; cache or page split with palignr: 57 cycles (amortized: +2 cycles)
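; As a rough C model of the realignment idea (an illustrative sketch only;
; the function name is hypothetical and it assumes little-endian x86):
;
;   #include <stdint.h>
;   /* emulate an unaligned 8-byte load with two aligned loads plus shifts,
;      the same thing the psrlq/psllq/por sequences below do in registers */
;   static inline uint64_t load64_split( const uint8_t *p )
;   {
;       const uint64_t *a = (const uint64_t*)((uintptr_t)p & ~(uintptr_t)7);
;       unsigned shift = ((uintptr_t)p & 7) * 8;   /* misalignment in bits */
;       return shift ? (a[0] >> shift) | (a[1] << (64 - shift)) : a[0];
;   }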
+
+; computed jump assumes this loop is exactly 64 bytes
+%macro SAD16_CACHELINE_LOOP 1 ; alignment
+ALIGN 16
+sad_w16_align%1:
+ movdqa xmm1, [rdx+16]
+ movdqa xmm2, [rdx+rcx+16]
+ palignr xmm1, [rdx], %1
+ palignr xmm2, [rdx+rcx], %1
+ psadbw xmm1, [rdi]
+ psadbw xmm2, [rdi+rsi]
+ paddw xmm0, xmm1
+ paddw xmm0, xmm2
+ lea rdx, [rdx+2*rcx]
+ lea rdi, [rdi+2*rsi]
+ dec eax
+ jg sad_w16_align%1
+ ret
+%endmacro
+
+%macro SAD16_CACHELINE_FUNC 1 ; height
+cglobal x264_pixel_sad_16x%1_cache64_ssse3
+ mov eax, parm3d
+ and eax, 0x37
+ cmp eax, 0x30
+ jle x264_pixel_sad_16x%1_sse2
+ mov eax, parm3d
+ and eax, 15
+ shl eax, 6
+%ifdef __PIC__
+ lea r10, [sad_w16_align1 - 64 GLOBAL]
+ add r10, rax
+%else
+ lea r10, [sad_w16_align1 - 64 + rax]
+%endif
+ and parm3q, ~15
+ mov eax, %1/2
+ pxor xmm0, xmm0
+ call r10
+ SAD_END_SSE2
+%endmacro
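; In C terms, the address computed into r10 is just an entry in a table of
; per-alignment kernels: each copy of the loop above is exactly 64 bytes, so
; misalign*64 added to the address of copy #1 minus 64 lands on copy
; #misalign. A sketch (the table, its type and pix2 are hypothetical names;
; the real kernels keep their state in registers):
;
;   typedef void (*sad_w16_loop_fn)( void );
;   extern sad_w16_loop_fn sad_w16_align_tab[16];   /* entries 1..15 used */
;   sad_w16_align_tab[(uintptr_t)pix2 & 15]();      /* misalignment mod 16 */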
+
+%macro SAD8_CACHELINE_FUNC 1 ; height
+cglobal x264_pixel_sad_8x%1_cache64_mmxext
+ mov eax, parm3d
+ and eax, 0x3f
+ cmp eax, 0x38
+ jle x264_pixel_sad_8x%1_mmxext
+ and eax, 7
+ shl eax, 3
+ movd mm6, [sw_64 GLOBAL]
+ movd mm7, eax
+ psubw mm6, mm7
+ and parm3q, ~7
+ mov eax, %1/2
+ pxor mm0, mm0
+.loop:
+ movq mm1, [parm3q+8]
+ movq mm2, [parm3q+parm4q+8]
+ movq mm3, [parm3q]
+ movq mm4, [parm3q+parm4q]
+ psllq mm1, mm6
+ psllq mm2, mm6
+ psrlq mm3, mm7
+ psrlq mm4, mm7
+ por mm1, mm3
+ por mm2, mm4
+ psadbw mm1, [parm1q]
+ psadbw mm2, [parm1q+parm2q]
+ paddw mm0, mm1
+ paddw mm0, mm2
+ lea parm3q, [parm3q+2*parm4q]
+ lea parm1q, [parm1q+2*parm2q]
+ dec eax
+ jg .loop
+ movd eax, mm0
+ ret
+%endmacro
+
+
+; sad_x3/x4_cache64: check each mv.
+; if they're all within a cacheline, use normal sad_x3/x4.
+; otherwise, send them individually to sad_cache64.
+%macro CHECK_SPLIT 2 ; pix, width
+ mov eax, %1
+ and eax, 0x37|%2
+ cmp eax, 0x30|%2
+ jg .split
+%endmacro
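; In C, the test above for a %2-byte load at address p with 64-byte
; cachelines reads roughly as follows (sketch; the function name is
; hypothetical):
;
;   /* slow path only if the load crosses a cacheline boundary, except for
;      the penalty-free case of an 8-aligned load split exactly in half */
;   static int needs_split_path( uintptr_t p, unsigned width )
;   {
;       return (p & (0x37 | width)) > (0x30 | width);
;   }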
+
+%macro SADX3_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
+cglobal x264_pixel_sad_x3_%1x%2_cache64_%4
+ CHECK_SPLIT parm2d, %1
+ CHECK_SPLIT parm3d, %1
+ CHECK_SPLIT parm4d, %1
+ jmp x264_pixel_sad_x3_%1x%2_%3
+.split:
+ push parm4q
+ push parm3q
+ mov parm3q, parm2q
+ mov parm2q, FENC_STRIDE
+ mov parm4q, parm5q
+ mov parm5q, parm1q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [parm6q], eax
+ pop parm3q
+ mov parm1q, parm5q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [parm6q+4], eax
+ pop parm3q
+ mov parm1q, parm5q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [parm6q+8], eax
+ ret
+%endmacro
+
+%macro SADX4_CACHELINE_FUNC 4 ; width, height, normal_ver, split_ver
+cglobal x264_pixel_sad_x4_%1x%2_cache64_%4
+ CHECK_SPLIT parm2d, %1
+ CHECK_SPLIT parm3d, %1
+ CHECK_SPLIT parm4d, %1
+ CHECK_SPLIT parm5d, %1
+ jmp x264_pixel_sad_x4_%1x%2_%3
+.split:
+ mov r11, parm7q
+ push parm5q
+ push parm4q
+ push parm3q
+ mov parm3q, parm2q
+ mov parm2q, FENC_STRIDE
+ mov parm4q, parm6q
+ mov parm5q, parm1q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [r11], eax
+ pop parm3q
+ mov parm1q, parm5q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [r11+4], eax
+ pop parm3q
+ mov parm1q, parm5q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [r11+8], eax
+ pop parm3q
+ mov parm1q, parm5q
+ call x264_pixel_sad_%1x%2_cache64_%4
+ mov [r11+12], eax
+ ret
+%endmacro
+
+%macro SADX34_CACHELINE_FUNC 4
+ SADX3_CACHELINE_FUNC %1, %2, %3, %4
+ SADX4_CACHELINE_FUNC %1, %2, %3, %4
+%endmacro
+
+cextern x264_pixel_sad_8x16_mmxext
+cextern x264_pixel_sad_8x8_mmxext
+cextern x264_pixel_sad_8x4_mmxext
+cextern x264_pixel_sad_x3_8x16_mmxext
+cextern x264_pixel_sad_x3_8x8_mmxext
+cextern x264_pixel_sad_x4_8x16_mmxext
+cextern x264_pixel_sad_x4_8x8_mmxext
+
+; instantiate the aligned sads
+
+SAD8_CACHELINE_FUNC 4
+SAD8_CACHELINE_FUNC 8
+SAD8_CACHELINE_FUNC 16
+SADX34_CACHELINE_FUNC 8, 16, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, mmxext, mmxext
+
+%ifdef HAVE_SSE3
+
+SAD16_CACHELINE_FUNC 8
+SAD16_CACHELINE_FUNC 16
+%assign i 1
+%rep 15
+SAD16_CACHELINE_LOOP i
+%assign i i+1
+%endrep
+
+SADX34_CACHELINE_FUNC 16, 16, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 8, sse2, ssse3
+
+%endif ; HAVE_SSE3
; ssd
if( h->param.b_interlaced )
i_lines = ( i_lines + 31 ) & -32;
+ if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
+ i_stride = (i_stride + align-1) & -align;
+ }
+
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
{
%1:
%endmacro
+%macro cextern 1
+ %ifdef PREFIX
+ extern _%1
+ %define %1 _%1
+ %else
+ extern %1
+ %endif
+%endmacro
+
; Name of the .rodata section. On OS X we cannot use .rodata because NASM
; is unable to compute address offsets outside of .text so we use the .text
; section instead until NASM is fixed.
paddd %1, %2
%endmacro
-%macro SAD_INC_4x16P_SSE2 0
- movdqu xmm1, [ecx]
- movdqu xmm2, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movdqu xmm3, [ecx]
- movdqu xmm4, [ecx+edx]
- psadbw xmm1, [eax]
- psadbw xmm2, [eax+ebx]
- lea eax, [eax+2*ebx]
- psadbw xmm3, [eax]
- psadbw xmm4, [eax+ebx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- paddw xmm1, xmm2
- paddw xmm3, xmm4
- paddw xmm0, xmm1
- paddw xmm0, xmm3
-%endmacro
-
%macro SAD_START_SSE2 0
push ebx
-
mov eax, [esp+ 8] ; pix1
mov ebx, [esp+12] ; stride1
mov ecx, [esp+16] ; pix2
%endmacro
%macro SAD_END_SSE2 0
- movdqa xmm1, xmm0
- psrldq xmm0, 8
+ movhlps xmm1, xmm0
paddw xmm0, xmm1
movd eax, xmm0
-
pop ebx
ret
%endmacro
+%macro SAD_W16 1
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_sse2
+cglobal x264_pixel_sad_16x16_%1
SAD_START_SSE2
movdqu xmm0, [ecx]
movdqu xmm1, [ecx+edx]
;-----------------------------------------------------------------------------
; int __cdecl x264_pixel_sad_16x8_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x8_sse2
+cglobal x264_pixel_sad_16x8_%1
SAD_START_SSE2
- pxor xmm0, xmm0
- SAD_INC_4x16P_SSE2
- SAD_INC_4x16P_SSE2
+ movdqu xmm0, [ecx]
+ movdqu xmm2, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+ movdqu xmm3, [ecx]
+ movdqu xmm4, [ecx+edx]
+ psadbw xmm0, [eax]
+ psadbw xmm2, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ psadbw xmm3, [eax]
+ psadbw xmm4, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ paddw xmm0, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm3
+ movdqu xmm1, [ecx]
+ movdqu xmm2, [ecx+edx]
+ lea ecx, [ecx+2*edx]
+ movdqu xmm3, [ecx]
+ movdqu xmm4, [ecx+edx]
+ psadbw xmm1, [eax]
+ psadbw xmm2, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ psadbw xmm3, [eax]
+ psadbw xmm4, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ paddw xmm1, xmm2
+ paddw xmm3, xmm4
+ paddw xmm0, xmm1
+ paddw xmm0, xmm3
SAD_END_SSE2
+%endmacro
+
+SAD_W16 sse2
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_W16 sse3
+%undef movdqu
+%endif
+; sad x3 / x4
+
%macro SAD_X3_START_1x16P 0
push edi
push esi
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
-%macro SAD_X 3
-cglobal x264_pixel_sad_x%1_%2x%3_sse2
+%macro SAD_X 4
+cglobal x264_pixel_sad_x%1_%2x%3_%4
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
SAD_X%1_END
%endmacro
-SAD_X 3, 16, 16
-SAD_X 3, 16, 8
-SAD_X 4, 16, 16
-SAD_X 4, 16, 8
+SAD_X 3, 16, 16, sse2
+SAD_X 3, 16, 8, sse2
+SAD_X 4, 16, 16, sse2
+SAD_X 4, 16, 8, sse2
+
+%ifdef HAVE_SSE3
+%define movdqu lddqu
+SAD_X 3, 16, 16, sse3
+SAD_X 3, 16, 8, sse3
+SAD_X 4, 16, 16, sse3
+SAD_X 4, 16, 8, sse3
+%undef movdqu
+%endif
+
+
+; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
+; unless the unaligned data spans the border between 2 cachelines, in which
+; case it's really slow. The exact numbers may differ, but all Intel cpus
+; have a large penalty for cacheline splits.
+; (8-byte alignment exactly halfway between two cachelines is ok though.)
+; LDDQU was supposed to fix this, but it only works on Pentium 4.
+; So in the split case we load aligned data and explicitly perform the
+; alignment between registers. Like on archs that have only aligned loads,
+; except complicated by the fact that PALIGNR takes only an immediate, not
+; a variable alignment.
+
+; computed jump assumes this loop is exactly 80 bytes
+%macro SAD16_CACHELINE_LOOP_SSE2 1 ; alignment
+ALIGN 16
+sad_w16_align%1_sse2:
+ movdqa xmm1, [ecx+16]
+ movdqa xmm2, [ecx+edx+16]
+ movdqa xmm3, [ecx]
+ movdqa xmm4, [ecx+edx]
+ pslldq xmm1, 16-%1
+ pslldq xmm2, 16-%1
+ psrldq xmm3, %1
+ psrldq xmm4, %1
+ por xmm1, xmm3
+ por xmm2, xmm4
+ psadbw xmm1, [eax]
+ psadbw xmm2, [eax+ebx]
+ paddw xmm0, xmm1
+ paddw xmm0, xmm2
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ dec esi
+ jg sad_w16_align%1_sse2
+ ret
+%endmacro
+
+; computed jump assumes this loop is exactly 64 bytes
+%macro SAD16_CACHELINE_LOOP_SSSE3 1 ; alignment
+ALIGN 16
+sad_w16_align%1_ssse3:
+ movdqa xmm1, [ecx+16]
+ movdqa xmm2, [ecx+edx+16]
+ palignr xmm1, [ecx], %1
+ palignr xmm2, [ecx+edx], %1
+ psadbw xmm1, [eax]
+ psadbw xmm2, [eax+ebx]
+ paddw xmm0, xmm1
+ paddw xmm0, xmm2
+ lea eax, [eax+2*ebx]
+ lea ecx, [ecx+2*edx]
+ dec esi
+ jg sad_w16_align%1_ssse3
+ ret
+%endmacro
+
+%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
+cglobal x264_pixel_sad_16x%2_cache64_%1
+ mov eax, [esp+12]
+ and eax, 0x37
+ cmp eax, 0x30
+ jle x264_pixel_sad_16x%2_sse2
+ mov eax, [esp+12]
+ push ebx
+ push edi
+ push esi
+ and eax, 15
+%ifidn %1, ssse3
+ shl eax, 6
+%else
+ lea eax, [eax*5]
+ shl eax, 4
+%endif
+ picgetgot ebx
+ lea edi, [sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1) + eax GOT_ebx]
+ mov eax, [esp+16]
+ mov ebx, [esp+20]
+ mov ecx, [esp+24]
+ mov edx, [esp+28]
+ and ecx, ~15
+ mov esi, %2/2
+ pxor xmm0, xmm0
+ call edi
+ pop esi
+ pop edi
+ SAD_END_SSE2
+%endmacro
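; The jump target above is  (address of copy 0) + misalign*loopsize,  where
; copy 0 is synthesized as align1 + (align1 - align2) = align1 - loopsize.
; loopsize is 80 bytes for the sse2 loops and 64 for the ssse3 ones, which
; is all the lea/shl pair computes: eax*5 then <<4 is misalign*80, while
; <<6 is misalign*64. A sketch in C (names hypothetical):
;
;   uintptr_t copy0  = align1 + (align1 - align2);   /* == align1 - loopsize */
;   uintptr_t target = copy0 + misalign * loopsize;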
+
+%macro SAD_CACHELINE_START_MMX2 4 ; width, height, iterations, cacheline
+ mov eax, [esp+12]
+ and eax, 0x17|%1|(%4>>1)
+ cmp eax, 0x10|%1|(%4>>1)
+ jle x264_pixel_sad_%1x%2_mmxext
+ push ebx
+ push esi
+ and eax, 7
+ shl eax, 3
+ mov ecx, 64
+ sub ecx, eax
+ movd mm7, eax
+ movd mm6, ecx
+ mov eax, [esp+12]
+ mov ebx, [esp+16]
+ mov ecx, [esp+20]
+ mov edx, [esp+24]
+ and ecx, ~7
+ mov esi, %3
+ pxor mm0, mm0
+%endmacro
+
+%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
+cglobal x264_pixel_sad_16x%1_cache%2_mmxext
+ SAD_CACHELINE_START_MMX2 16, %1, %1, %2
+.loop:
+ movq mm1, [ecx]
+ movq mm2, [ecx+8]
+ movq mm3, [ecx+16]
+ movq mm4, mm2
+ psrlq mm1, mm7
+ psllq mm2, mm6
+ psllq mm3, mm6
+ psrlq mm4, mm7
+ por mm1, mm2
+ por mm3, mm4
+ psadbw mm1, [eax]
+ psadbw mm3, [eax+8]
+ paddw mm0, mm1
+ paddw mm0, mm3
+ add ecx, edx
+ add eax, ebx
+ dec esi
+ jg .loop
+ pop esi
+ pop ebx
+ movd eax, mm0
+ ret
+%endmacro
+
+%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
+cglobal x264_pixel_sad_8x%1_cache%2_mmxext
+ SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
+.loop:
+ movq mm1, [ecx+8]
+ movq mm2, [ecx+edx+8]
+ movq mm3, [ecx]
+ movq mm4, [ecx+edx]
+ psllq mm1, mm6
+ psllq mm2, mm6
+ psrlq mm3, mm7
+ psrlq mm4, mm7
+ por mm1, mm3
+ por mm2, mm4
+ psadbw mm1, [eax]
+ psadbw mm2, [eax+ebx]
+ paddw mm0, mm1
+ paddw mm0, mm2
+ lea ecx, [ecx+2*edx]
+ lea eax, [eax+2*ebx]
+ dec esi
+ jg .loop
+ pop esi
+ pop ebx
+ movd eax, mm0
+ ret
+%endmacro
+
+
+; sad_x3/x4_cache64: check each mv.
+; if they're all within a cacheline, use normal sad_x3/x4.
+; otherwise, send them individually to sad_cache64.
+%macro CHECK_SPLIT 3 ; pix, width, cacheline
+ mov eax, %1
+ and eax, 0x17|%2|(%3>>1)
+ cmp eax, 0x10|%2|(%3>>1)
+ jg .split
+%endmacro
+
+%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
+ CHECK_SPLIT [esp+8], %1, %3
+ CHECK_SPLIT [esp+12], %1, %3
+ CHECK_SPLIT [esp+16], %1, %3
+ jmp x264_pixel_sad_x3_%1x%2_%4
+.split:
+ push edi
+ mov edi, [esp+28]
+ push dword [esp+24]
+ push dword [esp+16]
+ push dword 16
+ push dword [esp+20]
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+32]
+ mov [edi], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+36]
+ mov [edi+4], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [edi+8], eax
+ add esp, 16
+ pop edi
+ ret
+%endmacro
+
+%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
+ CHECK_SPLIT [esp+8], %1, %3
+ CHECK_SPLIT [esp+12], %1, %3
+ CHECK_SPLIT [esp+16], %1, %3
+ CHECK_SPLIT [esp+20], %1, %3
+ jmp x264_pixel_sad_x4_%1x%2_%4
+.split:
+ push edi
+ mov edi, [esp+32]
+ push dword [esp+28]
+ push dword [esp+16]
+ push dword 16
+ push dword [esp+20]
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+32]
+ mov [edi], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+36]
+ mov [edi+4], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov ecx, [esp+40]
+ mov [edi+8], eax
+ mov [esp+8], ecx
+ call x264_pixel_sad_%1x%2_cache%3_%5
+ mov [edi+12], eax
+ add esp, 16
+ pop edi
+ ret
+%endmacro
+
+%macro SADX34_CACHELINE_FUNC 5
+ SADX3_CACHELINE_FUNC %1, %2, %3, %4, %5
+ SADX4_CACHELINE_FUNC %1, %2, %3, %4, %5
+%endmacro
+
+cextern x264_pixel_sad_16x16_mmxext
+cextern x264_pixel_sad_16x8_mmxext
+cextern x264_pixel_sad_8x16_mmxext
+cextern x264_pixel_sad_8x8_mmxext
+cextern x264_pixel_sad_8x4_mmxext
+cextern x264_pixel_sad_x3_16x16_mmxext
+cextern x264_pixel_sad_x3_16x8_mmxext
+cextern x264_pixel_sad_x3_8x16_mmxext
+cextern x264_pixel_sad_x3_8x8_mmxext
+cextern x264_pixel_sad_x4_16x16_mmxext
+cextern x264_pixel_sad_x4_16x8_mmxext
+cextern x264_pixel_sad_x4_8x16_mmxext
+cextern x264_pixel_sad_x4_8x8_mmxext
+
+; instantiate the aligned sads
+
+SAD16_CACHELINE_FUNC sse2, 8
+SAD16_CACHELINE_FUNC sse2, 16
+%assign i 1
+%rep 15
+SAD16_CACHELINE_LOOP_SSE2 i
+%assign i i+1
+%endrep
+
+SAD16_CACHELINE_FUNC_MMX2 16, 32
+SAD8_CACHELINE_FUNC_MMX2 4, 32
+SAD8_CACHELINE_FUNC_MMX2 8, 32
+SAD8_CACHELINE_FUNC_MMX2 16, 32
+SAD16_CACHELINE_FUNC_MMX2 8, 64
+SAD16_CACHELINE_FUNC_MMX2 16, 64
+SAD8_CACHELINE_FUNC_MMX2 4, 64
+SAD8_CACHELINE_FUNC_MMX2 8, 64
+SAD8_CACHELINE_FUNC_MMX2 16, 64
+SAD16_CACHELINE_FUNC_MMX2 8, 32
+
+SADX34_CACHELINE_FUNC 16, 16, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 32, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 16, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 8, 8, 64, mmxext, mmxext
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, sse2
+
+%ifdef HAVE_SSE3
+
+SAD16_CACHELINE_FUNC ssse3, 8
+SAD16_CACHELINE_FUNC ssse3, 16
+%assign i 1
+%rep 15
+SAD16_CACHELINE_LOOP_SSSE3 i
+%assign i i+1
+%endrep
+
+SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
+SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3
+
+%endif ; HAVE_SSE3
%macro SSD_INC_2x16P_SSE2 0
#ifndef _I386_PIXEL_H
#define _I386_PIXEL_H 1
-int x264_pixel_sad_16x16_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_16x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_8x16_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_8x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_8x4_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_4x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_4x4_mmxext( uint8_t *, int, uint8_t *, int );
-
-void x264_pixel_sad_x3_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_16x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_16x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_8x16_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_8x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_8x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_4x8_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_4x4_mmxext( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-
-int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_16x8_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_8x16_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_8x8_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_8x4_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_4x8_mmx( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_4x4_mmx( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_satd_16x16_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_16x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x16_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x4_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_4x8_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_4x4_mmxext( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_sa8d_16x16_mmxext( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sa8d_8x8_mmxext( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sad_16x8_sse2( uint8_t *, int, uint8_t *, int );
-
-void x264_pixel_sad_x3_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x3_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_16x16_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-void x264_pixel_sad_x4_16x8_sse2( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * );
-
-int x264_pixel_ssd_16x16_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_ssd_16x8_sse2( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_satd_16x16_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_16x8_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x16_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x8_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x4_sse2( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_satd_16x16_ssse3( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_16x8_ssse3( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x16_ssse3( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x8_ssse3( uint8_t *, int, uint8_t *, int );
-int x264_pixel_satd_8x4_ssse3( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_sa8d_16x16_sse2( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int );
-
-int x264_pixel_sa8d_16x16_ssse3( uint8_t *, int, uint8_t *, int );
-int x264_pixel_sa8d_8x8_ssse3( uint8_t *, int, uint8_t *, int );
+#define DECL_PIXELS( ret, name, suffix, args ) \
+ ret x264_pixel_##name##_16x16_##suffix args;\
+ ret x264_pixel_##name##_16x8_##suffix args;\
+ ret x264_pixel_##name##_8x16_##suffix args;\
+ ret x264_pixel_##name##_8x8_##suffix args;\
+ ret x264_pixel_##name##_8x4_##suffix args;\
+ ret x264_pixel_##name##_4x8_##suffix args;\
+ ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) \
+ DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
+
+#define DECL_X4( name, suffix ) \
+ DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
+
+DECL_X1( sad, mmxext )
+DECL_X1( sad, sse2 )
+DECL_X1( sad, sse3 )
+DECL_X4( sad, mmxext )
+DECL_X4( sad, sse2 )
+DECL_X4( sad, sse3 )
+DECL_X1( ssd, mmx )
+DECL_X1( ssd, sse2 )
+DECL_X1( satd, mmxext )
+DECL_X1( satd, sse2 )
+DECL_X1( satd, ssse3 )
+DECL_X1( sa8d, mmxext )
+DECL_X1( sa8d, sse2 )
+DECL_X1( sa8d, ssse3 )
+DECL_X1( sad, cache32_mmxext );
+DECL_X1( sad, cache64_mmxext );
+DECL_X1( sad, cache64_sse2 );
+DECL_X1( sad, cache64_ssse3 );
+DECL_X4( sad, cache32_mmxext );
+DECL_X4( sad, cache64_mmxext );
+DECL_X4( sad, cache64_sse2 );
+DECL_X4( sad, cache64_ssse3 );
+
+#undef DECL_PIXELS
+#undef DECL_X1
+#undef DECL_X4
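/* For reference, each DECL_X1/DECL_X4 line above declares prototypes for all
 * seven block sizes; e.g. DECL_X1( sad, sse2 ) expands to
 *
 *   int x264_pixel_sad_16x16_sse2( uint8_t *, int, uint8_t *, int );
 *   int x264_pixel_sad_16x8_sse2 ( uint8_t *, int, uint8_t *, int );
 *   ...
 *   int x264_pixel_sad_4x4_sse2  ( uint8_t *, int, uint8_t *, int );
 */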
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
+
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ if( cpu&X264_CPU_CACHELINE_32 )
+ {
+ INIT5( sad, _cache32_mmxext );
+ INIT4( sad_x3, _cache32_mmxext );
+ INIT4( sad_x4, _cache32_mmxext );
+ }
+ else
+ {
+ INIT5( sad, _cache64_mmxext );
+ INIT4( sad_x3, _cache64_mmxext );
+ INIT4( sad_x4, _cache64_mmxext );
+ }
+ }
+#else
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
+ pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext;
+ pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmxext;
+ pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
+ pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmxext;
+ pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
+ pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmxext;
+ }
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
+
+#ifdef ARCH_X86
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ INIT2( sad, _cache64_sse2 );
+ INIT2( sad_x3, _cache64_sse2 );
+ INIT2( sad_x4, _cache64_sse2 );
+ }
+#endif
}
// these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
}
#ifdef HAVE_SSE3
+ if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
+ {
+ INIT2( sad, _sse3 );
+ INIT2( sad_x3, _sse3 );
+ INIT2( sad_x4, _sse3 );
+ }
+
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
#endif
+ if( cpu&X264_CPU_CACHELINE_SPLIT )
+ {
+ INIT2( sad, _cache64_ssse3 );
+ INIT2( sad_x3, _cache64_ssse3 );
+ INIT2( sad_x4, _cache64_ssse3 );
+ }
}
#endif //HAVE_SSE3
#endif //HAVE_MMX
int res_c, res_asm; \
if( pixel_asm.name[i] != pixel_ref.name[i] ) \
{ \
- used_asm = 1; \
- res_c = pixel_c.name[i]( buf1, 32, buf2, 16 ); \
- res_asm = pixel_asm.name[i]( buf1, 32, buf2, 16 ); \
- if( res_c != res_asm ) \
+ for( j=0; j<64; j++ ) \
{ \
- ok = 0; \
- fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+ used_asm = 1; \
+ res_c = pixel_c.name[i]( buf1, 32, buf2+j, 16 ); \
+ res_asm = pixel_asm.name[i]( buf1, 32, buf2+j, 16 ); \
+ if( res_c != res_asm ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
+ break; \
+ } \
} \
} \
} \
int res_c[4]={0}, res_asm[4]={0}; \
if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
{ \
- used_asm = 1; \
- res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 32 ); \
- res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 32 ); \
- res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 32 ); \
- if(N==4) \
+ for( j=0; j<64; j++) \
{ \
- res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 32 ); \
- pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 32, res_asm ); \
- } \
- else \
- pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 32, res_asm ); \
- if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
- { \
- ok = 0; \
- fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
- i, res_c[0], res_c[1], res_c[2], res_c[3], \
- res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
+ uint8_t *pix2 = buf2+j; \
+ used_asm = 1; \
+ res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 32 ); \
+ res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+30, 32 ); \
+ res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 32 ); \
+ if(N==4) \
+ { \
+ res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+99, 32 ); \
+ pixel_asm.sad_x4[i]( buf1, pix2, pix2+30, pix2+1, pix2+99, 32, res_asm ); \
+ } \
+ else \
+ pixel_asm.sad_x3[i]( buf1, pix2, pix2+30, pix2+1, 32, res_asm ); \
+ if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
+ i, res_c[0], res_c[1], res_c[2], res_c[3], \
+ res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
+ } \
} \
} \
} \
+ check_quant( cpu_ref, cpu_new );
}
+int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
+{
+ *cpu_ref = *cpu_new;
+ *cpu_new |= flags;
+ fprintf( stderr, "x264: %s\n", name );
+ return check_all( *cpu_ref, *cpu_new );
+}
+
int main(int argc, char *argv[])
{
int ret = 0;
}
#ifdef HAVE_MMX
- fprintf( stderr, "x264: MMXEXT against C\n" );
- cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT;
- ret = check_all( 0, cpu1 );
-
+ if( x264_cpu_detect() & X264_CPU_MMXEXT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" );
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" );
+ }
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
- fprintf( stderr, "\nx264: SSE2 against C\n" );
- cpu0 = cpu1;
- cpu1 |= X264_CPU_SSE | X264_CPU_SSE2;
- ret |= check_all( cpu0, cpu1 );
-
- if( x264_cpu_detect() & X264_CPU_SSSE3 )
- {
- fprintf( stderr, "\nx264: SSSE3 against C\n" );
- cpu0 = cpu1;
- cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3;
- ret |= check_all( cpu0, cpu1 );
- }
+ cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32);
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" );
+ }
+ if( x264_cpu_detect() & X264_CPU_SSE3 )
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" );
+ if( x264_cpu_detect() & X264_CPU_SSSE3 )
+ {
+ cpu1 &= ~X264_CPU_CACHELINE_SPLIT;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )