+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; {
+; int nmv=0, i, j;
+; *(uint32_t*)(masks+width) = 0;
+; for( i=0; i<width; i+=8 )
+; {
+; uint64_t mask = *(uint64_t*)(masks+i);
+; if( !mask ) continue;
+; for( j=0; j<8; j++ )
+; if( mask & ((uint64_t)255<<(j*8)) )
+; mvs[nmv++] = i+j;
+; }
+; return nmv;
+; }
+cglobal x264_pixel_ads_mvs
+ ; mvs = parm5q
+ ; masks = rsp
+ ; width = r10
+ mov dword [rsp+r10], 0
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov rdi, [rsp+rsi]
+ test rdi, rdi
+ jz .nexti
+ xor ecx, ecx
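+; TEST n: store the current candidate index unconditionally, but advance the
+; output pointer (rax) only if byte n of the mask is nonzero, so a rejected
+; entry is simply overwritten by the next store (branchless select).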
+%macro TEST 1
+ mov [parm5q+rax*2], si
+ test edi, 0xff<<(%1*8)
+ setne cl
+ add eax, ecx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ shr rdi, 32
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+.nexti:
+ add esi, 8
+ cmp esi, r10d
+ jl .loopi
+ leave
+ ret
+
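+; ADS_START/ADS_END: common prologue and loop tail for the ads kernels.
+; A (width+4)-byte scratch buffer is carved out of the stack (16-byte aligned);
+; each candidate position gets one byte, nonzero iff its DC-based cost bound
+; plus the mv cost is below thresh. ADS_END advances the pointers by 4*%1
+; candidates per iteration and finally jumps to x264_pixel_ads_mvs, which
+; turns the nonzero bytes into a list of mv x offsets and returns the count.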
+%macro ADS_START 0
+ push rbp
+ mov rbp, rsp
+ sub rsp, parm6q
+ sub rsp, 4
+ and rsp, ~15
+ mov rax, rsp
+ mov r10d, parm6d
+ shl parm3q, 1
+%endmacro
+
+%macro ADS_END 1
+ add parm2q, 8*%1
+ add parm4q, 8*%1
+ add rax, 4*%1
+ sub parm6d, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
movq mm6, [parm1q]
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+16]
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ pshufw mm1, [rbp+16], 0
+ paddusw mm0, [parm4q]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [rax], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
movq mm6, [parm1q]
+ pshufw mm5, parm7q, 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- shl parm3q, 1
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+parm3q]
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [parm4q], mm0
- add parm2q, 8
- add parm4q, 8
- sub parm5d, 4
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [rax], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
pshufw mm7, [parm1q], 0
+ pshufw mm6, parm7q, 0
+ ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+8]
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [parm4q], mm0
- movq [parm4q+8], mm1
- add parm2q, 16
- add parm4q, 16
- sub parm5d, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [parm4q]
+ paddusw mm1, [parm4q+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [rax], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ movdqa xmm4, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+ movdqu xmm10, [parm2q]
+ movdqu xmm11, [parm2q+parm3q]
+.loop:
+ movdqa xmm0, xmm10
+ movdqu xmm1, [parm2q+16]
+ movdqa xmm10, xmm1
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqa xmm2, xmm11
+ movdqu xmm3, [parm2q+parm3q+16]
+ movdqa xmm11, xmm3
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ paddusw xmm0, xmm9
+ movdqa xmm1, xmm8
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [rax], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ movq xmm6, [parm1q]
+ pshuflw xmm8, parm7q, 0
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ punpcklqdq xmm8, xmm8
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+parm3q]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm9, [parm4q]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm9
+ movdqa xmm4, xmm8
+ psubusw xmm4, xmm0
+ packsswb xmm4, xmm4
+ movq [rax], xmm4
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ pshuflw xmm7, [parm1q], 0
+ pshuflw xmm8, parm7q, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm8, xmm8
+ ADS_START
+.loop:
+ movdqu xmm0, [parm2q]
+ movdqu xmm1, [parm2q+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm9, [parm4q]
+ movdqu xmm10, [parm4q+16]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddusw xmm0, xmm9
+ paddusw xmm1, xmm10
+ movdqa xmm4, xmm8
+ movdqa xmm5, xmm8
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [rax], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+cglobal x264_pixel_ads_mvs
+ mov ebx, [ebp+24] ; mvs
+ mov ecx, esp ; masks
+ mov edi, [ebp+28] ; width
+ mov dword [ecx+edi], 0
+ push esi
+ push ebp
+ xor eax, eax
+ xor esi, esi
+.loopi:
+ mov ebp, [ecx+esi]
+ mov edx, [ecx+esi+4]
+ or edx, ebp
+ jz .nexti
+ xor edx, edx
+%macro TEST 1
+ mov [ebx+eax*2], si
+ test ebp, 0xff<<(%1*8)
+ setne dl
+ add eax, edx
+ inc esi
+%endmacro
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ mov ebp, [ecx+esi]
+ TEST 0
+ TEST 1
+ TEST 2
+ TEST 3
+ cmp esi, edi
+ jl .loopi
+ jmp .end
+.nexti:
+ add esi, 8
+ cmp esi, edi
+ jl .loopi
+.end:
+ pop ebp
+ pop esi
+ mov edi, [ebp-8]
+ mov ebx, [ebp-4]
+ leave
+ ret
+
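+; same scheme as the amd64 version: one mask byte per candidate in a stack
+; buffer (walked by edi), finished off by a tail-jump to x264_pixel_ads_mvs.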
+%macro ADS_START 0
+ push ebp
+ mov ebp, esp
+ push ebx
+ push edi
+ mov eax, [ebp+12] ; sums
+ mov ebx, [ebp+16] ; delta
+ mov ecx, [ebp+20] ; cost_mvx
+ mov edx, [ebp+28] ; width
+ sub esp, edx
+ sub esp, 4
+ and esp, ~15
+ mov edi, esp
+ shl ebx, 1
+%endmacro
+
+%macro ADS_END 1
+ add eax, 8*%1
+ add ecx, 8*%1
+ add edi, 4*%1
+ sub edx, 4*%1
+ jg .loop
+ jmp x264_pixel_ads_mvs
+%endmacro
+
;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-; uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
movq mm4, [eax+8]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+16]
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ pshufw mm1, [ebp+32], 0
+ paddusw mm0, [ecx]
+ psubusw mm1, mm0
+ packsswb mm1, mm1
+ movd [edi], mm1
+ ADS_END 1
cglobal x264_pixel_ads2_mmxext
- push ebx
- mov eax, [esp+8]
+ mov eax, [esp+4]
movq mm6, [eax]
+ pshufw mm5, [esp+28], 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov ecx, [esp+20]
- mov edx, [esp+24]
- shl ebx, 1
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+ebx]
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
- movq [ecx], mm0
- add eax, 8
- add ecx, 8
- sub edx, 4
- jg .loop
- pop ebx
- ret
+ paddusw mm0, [ecx]
+ movq mm4, mm5
+ psubusw mm4, mm0
+ packsswb mm4, mm4
+ movd [edi], mm4
+ ADS_END 1
cglobal x264_pixel_ads1_mmxext
mov eax, [esp+4]
pshufw mm7, [eax], 0
- mov eax, [esp+8]
- mov ecx, [esp+16]
- mov edx, [esp+20]
+ pshufw mm6, [esp+28], 0
+ ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+8]
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
- movq [ecx], mm0
- movq [ecx+8], mm1
- add eax, 16
- add ecx, 16
- sub edx, 8
- jg .loop
- nop
- ret
+ paddusw mm0, [ecx]
+ paddusw mm1, [ecx+8]
+ movq mm4, mm6
+ movq mm5, mm6
+ psubusw mm4, mm0
+ psubusw mm5, mm1
+ packsswb mm4, mm5
+ movq [edi], mm4
+ ADS_END 2
+
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+ mov eax, [esp+4] ; enc_dc
+ movdqa xmm4, [eax]
+ pshuflw xmm7, xmm4, 0
+ pshuflw xmm6, xmm4, 0xAA
+ pshufhw xmm5, xmm4, 0
+ pshufhw xmm4, xmm4, 0xAA
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpckhqdq xmm5, xmm5
+ punpckhqdq xmm4, xmm4
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ movdqu xmm2, [eax+ebx]
+ movdqu xmm3, [eax+ebx+16]
+ psubw xmm2, xmm5
+ psubw xmm3, xmm4
+ paddw xmm0, xmm1
+ MMX_ABS xmm2, xmm1
+ MMX_ABS xmm3, xmm1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm3
+ movd xmm1, [ebp+32] ; thresh
+ movdqu xmm2, [ecx]
+ pshuflw xmm1, xmm1, 0
+ punpcklqdq xmm1, xmm1
+ paddusw xmm0, xmm2
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads2_%1
+ mov eax, [esp+4] ; enc_dc
+ movq xmm6, [eax]
+ movd xmm5, [esp+28] ; thresh
+ pshuflw xmm7, xmm6, 0
+ pshuflw xmm6, xmm6, 0xAA
+ pshuflw xmm5, xmm5, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ punpcklqdq xmm5, xmm5
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+ebx]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm6
+ movdqu xmm4, [ecx]
+ MMX_ABS xmm0, xmm2
+ MMX_ABS xmm1, xmm3
+ paddw xmm0, xmm1
+ paddusw xmm0, xmm4
+ movdqa xmm1, xmm5
+ psubusw xmm1, xmm0
+ packsswb xmm1, xmm1
+ movq [edi], xmm1
+ ADS_END 2
+
+cglobal x264_pixel_ads1_%1
+ mov eax, [esp+4] ; enc_dc
+ movd xmm7, [eax]
+ movd xmm6, [esp+28] ; thresh
+ pshuflw xmm7, xmm7, 0
+ pshuflw xmm6, xmm6, 0
+ punpcklqdq xmm7, xmm7
+ punpcklqdq xmm6, xmm6
+ ADS_START
+.loop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax+16]
+ psubw xmm0, xmm7
+ psubw xmm1, xmm7
+ movdqu xmm2, [ecx]
+ movdqu xmm3, [ecx+16]
+ MMX_ABS xmm0, xmm4
+ MMX_ABS xmm1, xmm5
+ paddusw xmm0, xmm2
+ paddusw xmm1, xmm3
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm6
+ psubusw xmm4, xmm0
+ psubusw xmm5, xmm1
+ packsswb xmm4, xmm5
+ movdqa [edi], xmm4
+ ADS_END 4
+%endmacro
+
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+ pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width );
-void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width );
+#define DECL_ADS( size, suffix ) \
+int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
+DECL_ADS( 4, mmxext )
+DECL_ADS( 2, mmxext )
+DECL_ADS( 1, mmxext )
+DECL_ADS( 4, sse2 )
+DECL_ADS( 2, sse2 )
+DECL_ADS( 1, sse2 )
+DECL_ADS( 4, ssse3 )
+DECL_ADS( 2, ssse3 )
+DECL_ADS( 1, ssse3 )
+#undef DECL_ADS
#endif
/****************************************************************************
* successive elimination
****************************************************************************/
-static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[8] )
- + abs( enc_dc[2] - sums[delta] )
- + abs( enc_dc[3] - sums[delta+8] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[8] )
+ + abs( enc_dc[2] - sums[delta] )
+ + abs( enc_dc[3] - sums[delta+8] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] )
- + abs( enc_dc[1] - sums[delta] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + abs( enc_dc[1] - sums[delta] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
-static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
- uint16_t *res, int width )
+static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
- int i;
+ int nmv=0, i;
for( i=0; i<width; i++, sums++ )
- res[i] = abs( enc_dc[0] - sums[0] );
+ {
+ int ads = abs( enc_dc[0] - sums[0] )
+ + cost_mvx[i];
+ if( ads < thresh )
+ mvs[nmv++] = i;
+ }
+ return nmv;
}
pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
+#define INIT_ADS( cpu ) \
+ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
+ pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
+ pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
+
INIT7( sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
INIT4( sa8d, );
+ INIT_ADS( );
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
- pixf->ads[PIXEL_16x16] = pixel_ads4;
- pixf->ads[PIXEL_16x8] = pixel_ads2;
- pixf->ads[PIXEL_8x8] = pixel_ads1;
-
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
-
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;
- pixf->ads[PIXEL_16x8 ] = x264_pixel_ads2_mmxext;
- pixf->ads[PIXEL_8x8 ] = x264_pixel_ads1_mmxext;
+ INIT_ADS( _mmxext );
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
+ INIT_ADS( _sse2 );
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_SSSE3 )
{
INIT5( satd, _ssse3 );
+ INIT_ADS( _ssse3 );
#ifdef ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
x264_pixel_cmp_x4_t sad_x4[7];
/* abs-diff-sum for successive elimination.
- * may round width up to a multiple of 8. */
- void (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
- uint16_t *res, int width );
+ * may round width up to a multiple of 16.
+ * writes the offsets (relative to sums) of the candidates whose cost bound
+ * stays below thresh into mvs and returns how many there are. */
+ int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
+ uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
/* calculate satd of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
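+/* lambda * nbits of the mv delta, tabulated at fullpel resolution:
+ * x264_cost_mv_fpel[qp][j][x] is the cost of a qpel mv difference of 4*x+j. */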
+uint16_t *x264_cost_mv_fpel[52][4];
+
/* initialize an array of lambda*nbits for all possible mvs */
static void x264_mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
static int16_t *p_cost_mv[52];
+ int i, j;
if( !p_cost_mv[a->i_qp] )
{
/* could be faster, but isn't called many times */
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
- int i;
p_cost_mv[a->i_qp] = x264_malloc( (4*4*2048 + 1) * sizeof(int16_t) );
p_cost_mv[a->i_qp] += 2*4*2048;
for( i = 0; i <= 2*4*2048; i++ )
p_cost_mv[a->i_qp][i] = a->i_lambda * bs_size_se( i );
}
}
-
a->p_cost_mv = p_cost_mv[a->i_qp];
+
+ /* FIXME is this useful for all me methods? */
+ if( h->param.analyse.i_me_method == X264_ME_ESA && !x264_cost_mv_fpel[a->i_qp][0] )
+ {
+ for( j=0; j<4; j++ )
+ {
+ x264_cost_mv_fpel[a->i_qp][j] = x264_malloc( (4*2048 + 1) * sizeof(int16_t) );
+ x264_cost_mv_fpel[a->i_qp][j] += 2*2048;
+ for( i = -2*2048; i <= 2*2048; i++ )
+ x264_cost_mv_fpel[a->i_qp][j][i] = p_cost_mv[a->i_qp][i*4+j];
+ }
+ }
}
static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int i_qp )
COPY3_IF_LT( bcost, costs[3], bmx, omx+(m3x), bmy, omy+(m3y) );\
}
-#define COST_MV_X4_ABS( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
+#define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\
{\
- h->pixf.sad_x4[i_pixel]( m->p_fenc[0],\
+ h->pixf.sad_x3[i_pixel]( m->p_fenc[0],\
p_fref + (m0x) + (m0y)*m->i_stride[0],\
p_fref + (m1x) + (m1y)*m->i_stride[0],\
p_fref + (m2x) + (m2y)*m->i_stride[0],\
- p_fref + (m3x) + (m3y)*m->i_stride[0],\
m->i_stride[0], costs );\
- costs[0] += p_cost_mvx[m0x<<2]; /* no cost_mvy */\
- costs[1] += p_cost_mvx[m1x<<2];\
- costs[2] += p_cost_mvx[m2x<<2];\
- costs[3] += p_cost_mvx[m3x<<2];\
+ costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\
+ costs[1] += p_cost_mvx[(m1x)<<2];\
+ costs[2] += p_cost_mvx[(m2x)<<2];\
COPY3_IF_LT( bcost, costs[0], bmx, m0x, bmy, m0y );\
COPY3_IF_LT( bcost, costs[1], bmx, m1x, bmy, m1y );\
COPY3_IF_LT( bcost, costs[2], bmx, m2x, bmy, m2y );\
- COPY3_IF_LT( bcost, costs[3], bmx, m3x, bmy, m3y );\
}
/* 1 */
{
const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
- const int max_x = ((X264_MIN( bmx + i_me_range, mv_x_max ) - min_x + 3) & ~3) + min_x - 1;
+ const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
- int mx, my;
+ /* SEA is fastest in multiples of 4 */
+ const int width = (max_x - min_x + 3) & ~3;
+ int my;
#if 0
/* plain old exhaustive search */
+ int mx;
for( my = min_y; my <= max_y; my++ )
for( mx = min_x; mx <= max_x; mx++ )
COST_MV( mx, my );
const int stride = m->i_stride[0];
static uint8_t zero[16*16] = {0,};
uint16_t *sums_base = m->integral;
- int enc_dc[4];
+ DECLARE_ALIGNED( int, enc_dc[4], 16 );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
- uint16_t *ads = x264_malloc((max_x-min_x+8) * sizeof(uint16_t));
+ int16_t xs_buf[64];
+ int16_t *xs = width<=64 ? xs_buf : x264_malloc( width*sizeof(int16_t) );
+ int xn;
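+ /* fold the mvp offset into the table, so cost_fpel_mvx[mx] == m->p_cost_mv[mx*4 - m->mvp[0]] */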
+ uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
for( my = min_y; my <= max_y; my++ )
{
- int mvs[3], i_mvs=0;
bcost -= p_cost_mvy[my<<2];
- h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
- ads, max_x-min_x+1 );
- for( mx = min_x; mx <= max_x; mx++ )
- {
- if( ads[mx-min_x] < bcost - p_cost_mvx[mx<<2] )
- {
- if( i_mvs == 3 )
- {
- COST_MV_X4_ABS( mvs[0],my, mvs[1],my, mvs[2],my, mx,my );
- i_mvs = 0;
- }
- else
- mvs[i_mvs++] = mx;
- }
- }
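+ /* the ads kernel prunes the row to the x offsets whose cost lower bound
+ still beats the current best cost; only those get a full SAD */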
+ xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+ cost_fpel_mvx+min_x, xs, width, bcost );
+ for( i=0; i<xn-2; i+=3 )
+ COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
bcost += p_cost_mvy[my<<2];
- for( i=0; i<i_mvs; i++ )
- COST_MV( mvs[i], my );
+ for( ; i<xn; i++ )
+ COST_MV( min_x+xs[i], my );
}
- x264_free(ads);
+ if( xs != xs_buf )
+ x264_free( xs );
#endif
}
break;
int x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
+extern uint16_t *x264_cost_mv_fpel[52][4];
+
#define COPY1_IF_LT(x,y)\
if((y)<(x))\
(x)=(y);
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
DECLARE_ALIGNED( uint8_t, edge[33], 8 );
+ uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
int i, j;
}
ok = 1; used_asm = 0;
- for( i=0; i<4; i++ )
- if( pixel_asm.ads[i] != pixel_ref.ads[i] )
+ for( i=0; i<32; i++ )
+ cost_mv[i] = i*10;
+ for( i=0; i<100; i++ )
+ if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
- uint16_t res_a[32], res_c[32];
- uint16_t sums[72];
- int dc[4];
+ DECLARE_ALIGNED( uint16_t, sums[72], 16 );
+ DECLARE_ALIGNED( int, dc[4], 16 );
+ int16_t mvs_a[32], mvs_c[32];
+ int mvn_a, mvn_c;
+ int thresh = rand() & 0x3fff;
for( j=0; j<72; j++ )
sums[j] = rand() & 0x3fff;
for( j=0; j<4; j++ )
dc[j] = rand() & 0x3fff;
used_asm = 1;
- pixel_c.ads[i]( dc, sums, 32, res_c, 32 );
- pixel_asm.ads[i]( dc, sums, 32, res_a, 32 );
- if( memcmp(res_a, res_c, sizeof(res_c)) )
+ mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 32, thresh );
+ mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 32, thresh );
+ if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
ok = 0;
}
report( "esa ads:" );