A "make distclean" is probably required after updating to this revision.
default: $(DEP) x264$(EXE)
libx264.a: .depend $(OBJS) $(OBJASM)
- ar rc libx264.a $(OBJS) $(OBJASM)
- ranlib libx264.a
+ $(AR) rc libx264.a $(OBJS) $(OBJASM)
+ $(RANLIB) libx264.a
$(SONAME): .depend $(OBJS) $(OBJASM)
$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
- -@ strip -x $@
+ -@ $(STRIP) -x $@
.depend: config.mak
rm -f .depend
clean:
rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
- rm -f checkasm checkasm.exe tools/checkasm.o
+ rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
install -m 644 libx264.a $(DESTDIR)$(libdir)
install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
install x264$(EXE) $(DESTDIR)$(bindir)
- ranlib $(DESTDIR)$(libdir)/libx264.a
+ $(RANLIB) $(DESTDIR)$(libdir)/libx264.a
ifeq ($(SYS),MINGW)
$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
else
buf = (uint8_t *) malloc( i_size + 15 + sizeof( void ** ) +
sizeof( int ) );
align_buf = buf + 15 + sizeof( void ** ) + sizeof( int );
- align_buf -= (long) align_buf & 15;
+ align_buf -= (intptr_t) align_buf & 15;
*( (void **) ( align_buf - sizeof( void ** ) ) ) = buf;
*( (int *) ( align_buf - sizeof( void ** ) - sizeof( int ) ) ) = i_size;
return align_buf;
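
The original malloc() pointer is stashed just below align_buf so the block can be released later; a minimal sketch of that counterpart under the layout above (the function name here is illustrative, not necessarily the project's actual API):

    #include <stdlib.h>

    /* Recover the original malloc() pointer stored immediately before the
     * aligned pointer and hand it back to free(). */
    static void aligned_free_sketch( void *align_buf )
    {
        if( align_buf )
            free( *( ( (void **)align_buf ) - 1 ) );
    }
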
{
int64_t i_ssd = 0;
int x, y;
- int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
+ int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 );
cextern x264_cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
-%ifdef ARCH_X86_64
+%ifdef WIN64
+ DECLARE_REG_TMP 3,1,2,0,4,5,6,10
+ %define pointer resq
+%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,10
%define pointer resq
%else
%endmacro
cglobal x264_cabac_encode_decision_asm, 0,7
- movifnidn t0d, r0m
+ movifnidn t0, r0mp
movifnidn t1d, r1m
- mov t5d, [r0+cb.range]
- movzx t3d, byte [r0+cb.state+t1]
+ mov t5d, [t0+cb.range]
+ movzx t3d, byte [t0+cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
shr t6d, 6
movifnidn t2d, r2m
cmp t6d, t2d
- mov t6d, [r0+cb.low]
+ mov t6d, [t0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
movifnidn t1d, r1m
- mov [r0+cb.state+t1], t3b
+ mov [t0+cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
- add t3d, [r0+cb.queue]
- mov [r0+cb.range], t4d
- mov [r0+cb.low], t6d
- mov [r0+cb.queue], t3d
+ add t3d, [t0+cb.queue]
+ mov [t0+cb.range], t4d
+ mov [t0+cb.low], t6d
+ mov [t0+cb.queue], t3d
cmp t3d, 8
jge .putbyte
REP_RET
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
- mov [r0+cb.queue], t3d
- mov [r0+cb.low], t6d
+ mov [t0+cb.queue], t3d
+ mov [t0+cb.low], t6d
mov t1d, t2d
- mov t4, [r0+cb.p]
+ mov t4, [t0+cb.p]
je .postpone
- mov t5d, [r0+cb.bytes_outstanding]
+ mov t5d, [t0+cb.bytes_outstanding]
shr t1d, 8 ; carry
add [t4-1], t1b
test t5d, t5d
.no_outstanding:
mov [t4], t2b
inc t4
- mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
- mov [r0+cb.p], t4
+ mov [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
+ mov [t0+cb.p], t4
RET
.postpone:
- inc dword [r0+cb.bytes_outstanding]
+ inc dword [t0+cb.bytes_outstanding]
RET
SECTION .text
%ifdef ARCH_X86_64
+
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid
+cglobal x264_cpu_cpuid, 5,7
push rbx
- mov r10, r3
- mov r11, r2
- mov r9, r1
+ mov r11, r1
+ mov r10, r2
+ movifnidn r9, r3
+ movifnidn r8, r4
mov eax, r0d
cpuid
- mov [r9], eax
- mov [r11], ebx
- mov [r10], ecx
+ mov [r11], eax
+ mov [r10], ebx
+ mov [r9], ecx
mov [r8], edx
pop rbx
- ret
+ RET
%else
call ecx
leave
ret
+
%endif
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
-global x264_sub8x8_dct8_mmx %+ .skip_prologue
+global x264_sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
call load_diff_4x8_mmx
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
-global x264_add8x8_idct8_mmx %+ .skip_prologue
+global x264_add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
add word [r1], 32
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3
-global x264_sub8x8_dct8_sse2 %+ .skip_prologue
+global x264_sub8x8_dct8_sse2.skip_prologue
.skip_prologue:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
-global x264_add8x8_idct8_sse2 %+ .skip_prologue
+global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_sse2
+cglobal x264_sub8x8_dct8_sse2, 3,3,10
LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
- ret
+ RET
%macro IDCT8_1D 10
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
+cglobal x264_add8x8_idct8_sse2, 2,2,10
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
- ret
+ RET
INIT_XMM
-cglobal x264_sub8x8_dct_sse2, 3,3
+cglobal x264_sub8x8_dct_sse2, 3,3,8
.skip_prologue:
call .8x4
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
+%ifdef WIN64
+ call .8x4
+ RET
+%endif
.8x4:
SUB_DCT4 2x4x4W
movhps [r0+32], m0
movhps [r0+56], m3
ret
-cglobal x264_add8x8_idct_sse2, 2,2
+cglobal x264_add8x8_idct_sse2, 2,2,8
.skip_prologue:
call .8x4
add r1, 64
add r0, 4*FDEC_STRIDE
+%ifdef WIN64
+ call .8x4
+ RET
+%endif
.8x4:
movq m0, [r1+ 0]
movq m1, [r1+ 8]
%macro SUB_NxN_DCT 6
cglobal %1, 3,3
.skip_prologue:
+%ifdef WIN64
+ sub rsp, 8
+%endif
call %2
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
+%ifdef WIN64
+ add rsp, 8
+%endif
jmp %2
%endmacro
%macro ADD_NxN_IDCT 6
cglobal %1, 2,2
.skip_prologue:
+%ifdef WIN64
+ sub rsp, 8
+%endif
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
call %2
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
+%ifdef WIN64
+ add rsp, 8
+%endif
jmp %2
%endmacro
%ifndef ARCH_X86_64
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
+SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
+ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
-%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
+SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+%define x264_sub8x8_dct_sse2 x264_sub8x8_dct_sse2.skip_prologue
+%define x264_add8x8_idct_sse2 x264_add8x8_idct_sse2.skip_prologue
+%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2, 64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
punpcklbw mm1, mm1
ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
ADD_DC mm2, mm3, r0
- ret
+ RET
cglobal x264_add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
movhps [r0+FDEC_STRIDE* 1], xmm3
movhps [r0+FDEC_STRIDE* 2], xmm4
movhps [r0+FDEC_STRIDE* 3], xmm5
- ret
+ RET
cglobal x264_add16x16_idct_dc_mmx, 2,3
mov r2, 4
add r0, FDEC_STRIDE*4
dec r2
jg .loop
- ret
+ REP_RET
%macro IDCT_DC_STORE 3
movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
-cglobal x264_add16x16_idct_dc_sse2, 2,2
+cglobal x264_add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
+%ifdef WIN64
+ call .loop
+ RET
+%endif
.loop:
add r0, FDEC_STRIDE*4
movq xmm0, [r1+0]
IDCT_DC_STORE 0, xmm2, xmm3
ret
-cglobal x264_add16x16_idct_dc_ssse3, 2,2
+cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
+%ifdef WIN64
+ call .loop
+ RET
+%endif
.loop:
add r0, FDEC_STRIDE*4
movdqa xmm0, [r1]
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
+cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
+cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE]
; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_v_luma_sse2, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
- ret
+ RET
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX
-cglobal x264_deblock_h_luma_sse2
- movsxd r10, esi
+cglobal x264_deblock_h_luma_sse2, 5,7
+ movsxd r10, r1d
lea r11, [r10+r10*2]
- lea rax, [r0-4]
- lea r9, [r0-4+r11]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r11]
+%ifdef WIN64
+ sub rsp, 0x98
+ %define pix_tmp rsp+0x30
+%else
sub rsp, 0x68
%define pix_tmp rsp
+%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp
- lea rax, [rax+r10*8]
- lea r9, [r9 +r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
+ lea r6, [r6+r10*8]
+ lea r5, [r5+r10*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
- mov esi, 0x10
+ mov r1d, 0x10
+%ifdef WIN64
+ mov [rsp+0x20], r4
+%endif
call x264_deblock_v_luma_sse2
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- add rax, 2
- add r9, 2
+ add r6, 2
+ add r5, 2
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+ TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
- sub rax, r10
- sub r9, r10
+ sub r6, r10
+ sub r5, r10
shr r10, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(rax, r9, r10, r11)
+ TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
+%ifdef WIN64
+ add rsp, 0x98
+%else
add rsp, 0x68
- ret
+%endif
+ RET
%else
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
- mov r3, r4m
+ mov r3, r4mp
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_deblock_h_luma_%1, 0,5
- mov r0, r0m
+ mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
ADD esp, 20
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
- mov r0, r0m
+ mov r0, r0mp
sub r0, 2
lea r1, [r0+r4]
;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6
+cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
sub esp, 0x60
%endif
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1
+cglobal x264_deblock_h_luma_intra_%1, 4,7
movsxd r10, r1d
lea r11, [r10*3]
- lea rax, [r0-4]
- lea r9, [r0-4+r11]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r11]
sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea rax, [rax+r10*8]
- lea r9, [r9+r10*8]
- TRANSPOSE8x8_MEM PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r10*8]
+ lea r5, [r5+r10*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call x264_deblock_v_luma_intra_%1
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r9, [rax+r11]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ lea r5, [r6+r11]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
- sub rax, r10
- sub r9, r10
+ sub r6, r10
+ sub r5, r10
shr r10, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
add rsp, 0x88
- ret
+ RET
%else
cglobal x264_deblock_h_luma_intra_%1, 2,4
lea r3, [r1*3]
ADD esp, 16
mov r1, r1m
- mov r0, r0m
+ mov r0, r0mp
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,10,11
- %macro AVG_START 0
- PROLOGUE 6,7
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,7,%1
+%ifdef WIN64
+ movsxd r5, r5d
+%endif
.height_loop:
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
- %macro AVG_START 0
- PROLOGUE 0,7
+ %macro AVG_START 0-1 0
+ PROLOGUE 0,7,%1
mov t0, r0m
mov t1, r1m
mov t2, r2m
%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
- punpcklbw m0, m7
- punpcklbw m1, m7
- pmullw m0, m4
- pmullw m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, m2
+ pmullw m1, m3
paddw m0, m1
- paddw m0, m6
+ paddw m0, m4
psraw m0, 6
%endmacro
%macro BIWEIGHT_START_MMX 0
- movd m4, r6m
- SPLATW m4, m4 ; weight_dst
- mova m5, [pw_64 GLOBAL]
- psubw m5, m4 ; weight_src
- mova m6, [pw_32 GLOBAL] ; rounding
- pxor m7, m7
+ movd m2, r6m
+ SPLATW m2, m2 ; weight_dst
+ mova m3, [pw_64 GLOBAL]
+ psubw m3, m2 ; weight_src
+ mova m4, [pw_32 GLOBAL] ; rounding
+ pxor m5, m5
%endmacro
%macro BIWEIGHT_SSSE3 2
movh m0, %1
movh m1, %2
punpcklbw m0, m1
- pmaddubsw m0, m5
- paddw m0, m6
+ pmaddubsw m0, m3
+ paddw m0, m4
psraw m0, 6
%endmacro
sub t7d, t6d
shl t7d, 8
add t6d, t7d
- movd m5, t6d
- mova m6, [pw_32 GLOBAL]
- SPLATW m5, m5 ; weight_dst,src
+ movd m3, t6d
+ mova m4, [pw_32 GLOBAL]
+ SPLATW m3, m3 ; weight_dst,src
%endmacro
%macro BIWEIGHT_ROW 4
packuswb m0, m0
movh [%1], m0
%else
- SWAP 0, 2
+ SWAP 0, 6
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
- packuswb m2, m0
- mova [%1], m2
+ packuswb m6, m0
+ mova [%1], m6
%endif
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
-%macro AVG_WEIGHT 2
-cglobal x264_pixel_avg_weight_w%2_%1, 0,0
+%macro AVG_WEIGHT 2-3 0
+cglobal x264_pixel_avg_weight_w%2_%1
BIWEIGHT_START
- AVG_START
+ AVG_START %3
%if %2==8 && mmsize==16
BIWEIGHT [t2], [t4]
- SWAP 0, 2
+ SWAP 0, 6
BIWEIGHT [t2+t3], [t4+t5]
- packuswb m2, m0
- movlps [t0], m2
- movhps [t0+t1], m2
+ packuswb m6, m0
+ movlps [t0], m6
+ movhps [t0+t1], m6
%else
%assign x 0
%rep 1+%2/(mmsize*2)
AVG_WEIGHT mmxext, 16
INIT_XMM
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
-AVG_WEIGHT sse2, 8
-AVG_WEIGHT sse2, 16
+AVG_WEIGHT sse2, 8, 7
+AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX
AVG_WEIGHT ssse3, 4
INIT_XMM
-AVG_WEIGHT ssse3, 8
-AVG_WEIGHT ssse3, 16
+AVG_WEIGHT ssse3, 8, 7
+AVG_WEIGHT ssse3, 16, 7
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 3
-cglobal x264_pixel_avg_%1x%2_%3,0,0
+cglobal x264_pixel_avg_%1x%2_%3
mov eax, %2
cmp dword r6m, 32
jne x264_pixel_avg_weight_w%1_%3
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
-cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
+cglobal x264_pixel_avg2_w%1_cache%2_%3
mov eax, r2m
and eax, 0x1f|(%2>>1)
cmp eax, (32-%1)|(%2>>1)
add r0, r1
dec r5d
jg .height_loop
- RET
+ REP_RET
x264_pixel_avg2_w16_cache_mmxext:
AVG_CACHELINE_START
add r0, r1
dec r5d
jg .height_loop
- RET
+ REP_RET
x264_pixel_avg2_w20_cache_mmxext:
AVG_CACHELINE_START
add r0, r1
dec r5d
jg .height_loop
- RET
+ REP_RET
%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK 8, 32, mmxext
lea r2, [r2+r4+64]
prefetcht0 [r2]
prefetcht0 [r2+r3]
- ret
+ RET
%else
cglobal x264_prefetch_fenc_mmxext
prefetcht0 [r0+r1]
prefetcht0 [r0+r1*2]
prefetcht0 [r0+r2]
- ret
+ RET
%endif
%macro MC_CHROMA_START 0
- movifnidn r2d, r2m
+ movifnidn r2, r2mp
movifnidn r3d, r3m
movifnidn r4d, r4m
movifnidn r5d, r5m
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
-%macro MC_CHROMA 1
-cglobal x264_mc_chroma_%1, 0,6
+%macro MC_CHROMA 1-2 0
+cglobal x264_mc_chroma_%1
%if mmsize == 16
cmp dword r6m, 4
- jle x264_mc_chroma_mmxext %+ .skip_prologue
+ jle x264_mc_chroma_mmxext
%endif
-.skip_prologue:
+ PROLOGUE 0,6,%2
MC_CHROMA_START
pxor m3, m3
and r4d, 7 ; dx &= 7
mov r10, r0
mov r11, r2
%else
- mov r0, r0m
+ mov r0, r0mp
mov r1, r1m
mov r5, r2
%endif
lea r0, [r10+4] ; dst
lea r2, [r11+4] ; src
%else
- mov r0, r0m
+ mov r0, r0mp
lea r2, [r5+4]
- add r0, 4
+ add r0, 4
%endif
mov r4d, r7m ; height
jmp .loop2d
SPLATW m6, m6
mova m7, [pw_4 GLOBAL]
psubw m5, m6
- movifnidn r0d, r0m
+ movifnidn r0, r0mp
movifnidn r1d, r1m
mov r4d, r7m
%if mmsize == 8
INIT_MMX
MC_CHROMA mmxext
INIT_XMM
-MC_CHROMA sse2
+MC_CHROMA sse2, 8
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6
+cglobal x264_mc_chroma_ssse3, 0,6,8
MC_CHROMA_START
and r4d, 7
and r5d, 7
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0d, r0m
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0d, r0m
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
INIT_MMX
-%macro HPEL_V 1
+%macro HPEL_V 1-2 0
;-----------------------------------------------------------------------------
; void x264_hpel_filter_v_mmxext( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_v_%1, 5,6
+cglobal x264_hpel_filter_v_%1, 5,6,%2
+%ifdef WIN64
+ movsxd r4, r4d
+%endif
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
;-----------------------------------------------------------------------------
; void x264_hpel_filter_c_sse2( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_c_%1, 3,3
+cglobal x264_hpel_filter_c_%1, 3,3,9
add r0, r2
lea r1, [r1+r2*2]
neg r2
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_sse2( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_h_sse2, 3,3
+cglobal x264_hpel_filter_h_sse2, 3,3,8
add r0, r2
add r1, r2
neg r2
%ifndef ARCH_X86_64
HPEL_C sse2
%endif
-HPEL_V sse2
+HPEL_V sse2, 8
HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
; void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal x264_hpel_filter_%1, 7,7
+cglobal x264_hpel_filter_%1, 7,7,16
+%ifdef WIN64
+ movsxd r4, r4d
+ movsxd r5, r5d
+%endif
mov r10, r3
sub r5, 16
mov r11, r1
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1 ; FIXME
-cglobal x264_frame_init_lowres_core_%1, 6,7
+%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
+cglobal x264_frame_init_lowres_core_%1, 6,7,%2
+%ifdef WIN64
+ movsxd r5, r5d
+%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
dec r6d
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
-FRAME_INIT_LOWRES sse2
+FRAME_INIT_LOWRES sse2, 12
%define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3
+FRAME_INIT_LOWRES ssse3, 12
static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
int stride, int width, int height, int16_t *buf )\
{\
- int realign = (long)src & (align-1);\
+ int realign = (intptr_t)src & (align-1);\
src -= realign;\
dstv -= realign;\
dstc -= realign;\
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-%macro SSD 3
-cglobal x264_pixel_ssd_%1x%2_%3, 4,4
+%macro SSD 3-4 0
+cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
%if %1 >= mmsize
pxor m7, m7
%endif
SSD 4, 8, mmx
SSD 4, 4, mmx
INIT_XMM
-SSD 16, 16, sse2
-SSD 16, 8, sse2
-SSD 8, 16, sse2
-SSD 8, 8, sse2
-SSD 8, 4, sse2
+SSD 16, 16, sse2, 8
+SSD 16, 8, sse2, 8
+SSD 8, 16, sse2, 5
+SSD 8, 8, sse2, 5
+SSD 8, 4, sse2, 5
cglobal x264_pixel_ssd_4x8_sse4, 4,4
SSD_QUARTER 0, 0, r1, r3, 0, 1
VAR_END 6
INIT_XMM
-cglobal x264_pixel_var_16x16_sse2, 2,3
+cglobal x264_pixel_var_16x16_sse2, 2,3,8
VAR_START
VAR_2ROW r1, 8
VAR_END 8
-cglobal x264_pixel_var_8x8_sse2, 2,3
+cglobal x264_pixel_var_8x8_sse2, 2,3,8
VAR_START
mov t3d, 4
.loop:
lea r0, [r10+8]
lea r2, [r11+8]
%else
- mov r0, r0m
- mov r2, r2m
+ mov r0, r0mp
+ mov r2, r2mp
add r0, 8
add r2, 8
%endif
SATD_8x4_SSE2 %1
ret
-cglobal x264_pixel_satd_16x16_%1, 4,6
+cglobal x264_pixel_satd_16x16_%1, 4,6,8
SATD_START_SSE2
BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
-cglobal x264_pixel_satd_16x8_%1, 4,6
+cglobal x264_pixel_satd_16x8_%1, 4,6,8
SATD_START_SSE2
BACKUP_POINTERS
call x264_pixel_satd_8x8_internal_%1
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
-cglobal x264_pixel_satd_8x16_%1, 4,6
+cglobal x264_pixel_satd_8x16_%1, 4,6,8
SATD_START_SSE2
call x264_pixel_satd_8x8_internal_%1
lea r0, [r0+4*r1]
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
-cglobal x264_pixel_satd_8x8_%1, 4,6
+cglobal x264_pixel_satd_8x8_%1, 4,6,8
SATD_START_SSE2
call x264_pixel_satd_8x8_internal_%1
SATD_END_SSE2
-cglobal x264_pixel_satd_8x4_%1, 4,6
+cglobal x264_pixel_satd_8x4_%1, 4,6,8
SATD_START_SSE2
call x264_pixel_satd_8x4_internal_%1
SATD_END_SSE2
-cglobal x264_pixel_satd_4x8_%1, 4,6
+cglobal x264_pixel_satd_4x8_%1, 4,6,8
INIT_XMM
LOAD_MM_PERMUTATION satd_4x8_internal
%define movh movd
pavgw m0, m4
ret
-cglobal x264_pixel_sa8d_8x8_%1, 4,6
+cglobal x264_pixel_sa8d_8x8_%1, 4,6,10
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1
movd eax, m0
add eax, 1
shr eax, 1
- ret
+ RET
-cglobal x264_pixel_sa8d_16x16_%1, 4,6
+cglobal x264_pixel_sa8d_16x16_%1, 4,6,11
lea r4, [3*r1]
lea r5, [3*r3]
call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
movd eax, m0
add eax, 1
shr eax, 1
- ret
+ RET
%else ; ARCH_X86_32
cglobal x264_pixel_sa8d_8x8_internal_%1
;-----------------------------------------------------------------------------
; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
;-----------------------------------------------------------------------------
-cglobal x264_intra_sa8d_x3_8x8_core_%1
+cglobal x264_intra_sa8d_x3_8x8_core_%1, 3,3,16
; 8x8 hadamard
pxor m8, m8
movq m0, [r0+0*FENC_STRIDE]
HADAMARD8_1D m0, m1, m2, m3, m4, m5, m6, m7
; dc
- movzx edi, word [r1+0]
- add di, word [r1+16]
- add edi, 8
- and edi, -16
- shl edi, 2
+ movzx r0d, word [r1+0]
+ add r0w, word [r1+16]
+ add r0d, 8
+ and r0d, -16
+ shl r0d, 2
pxor m15, m15
movdqa m8, m2
movdqa m14, m15 ; 7x8 sum
movdqa m8, [r1+0] ; left edge
- movd m9, edi
+ movd m9, r0d
psllw m8, 3
psubw m8, m0
psubw m9, m0
movq [r2], m3 ; i8x8_v, i8x8_h
psrldq m3, 8
movd [r2+8], m3 ; i8x8_dc
- ret
+ RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
SUM_MM_X3 m0, m4, m5, m1, m2, m3, m6, pavgw
%ifndef ARCH_X86_64
- mov r2, r2m
+ mov r2, r2mp
%endif
movd [r2+0], m0 ; i4x4_v satd
movd [r2+4], m4 ; i4x4_h satd
%define sums rsp+64 ; size 24
%define top_1d rsp+32 ; size 32
%define left_1d rsp ; size 32
- movifnidn r1d, r1m
+ movifnidn r1, r1mp
CLEAR_SUMS
; 1D hadamards
and t2d, -16 ; dc
; 2D hadamards
- movifnidn r0d, r0m
+ movifnidn r0, r0mp
xor r3d, r3d
.loop_y:
xor r4d, r4d
jl .loop_y
; horizontal sum
- movifnidn r2d, r2m
+ movifnidn r2, r2mp
movq m2, [sums+16]
movq m1, [sums+8]
movq m0, [sums+0]
%define dc_1d rsp+32 ; size 16
%define top_1d rsp+16 ; size 16
%define left_1d rsp ; size 16
- movifnidn r1d, r1m
+ movifnidn r1, r1mp
CLEAR_SUMS
; 1D hadamards
lea r5, [dc_1d]
; 2D hadamards
- movifnidn r0d, r0m
- movifnidn r2d, r2m
+ movifnidn r0, r0mp
+ movifnidn r2, r2mp
xor r3d, r3d
.loop_y:
xor r4d, r4d
; struct { int satd, int sa8d; } x264_pixel_hadamard_ac_16x16( uint8_t *pix, int stride )
%macro HADAMARD_AC_WXH_SSE2 3
-cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3
+cglobal x264_pixel_hadamard_ac_%1x%2_%3, 2,3,11
%assign pad 16-gprsize-(stack_offset&15)
%define ysub r1
sub rsp, 48+pad
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4
+cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
pxor m0, m0
pxor m1, m1
pxor m2, m2
punpckldq m3, m4
punpckhdq m5, m4
-%ifdef ARCH_X86_64
+%ifdef UNIX64
%define t0 r4
%else
- %define t0 eax
- mov t0, r4m
+ %define t0 rax
+ mov t0, r4mp
%endif
movq [t0+ 0], m1
;-----------------------------------------------------------------------------
; float x264_pixel_ssim_end_sse2( int sum0[5][4], int sum1[5][4], int width )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_end4_sse2, 3,3
+cglobal x264_pixel_ssim_end4_sse2, 3,3,7
movdqa m0, [r0+ 0]
movdqa m1, [r0+16]
movdqa m2, [r0+32]
%macro ADS_START 1 ; unroll_size
%ifdef ARCH_X86_64
%define t0 r6
+%ifdef WIN64
+ mov r4, r4mp
+ movsxd r5, dword r5m
+%endif
mov r10, rsp
%else
%define t0 r4
add t0, 4*%1
sub r0d, 4*%1
jg .loop
+%ifdef WIN64
+ RESTORE_XMM r10
+%endif
jmp ads_mvs
%endmacro
ABS1 mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
-%ifdef ARCH_X86_64
+%ifdef WIN64
+ pshufw mm1, [r10+stack_offset+56], 0
+%elifdef ARCH_X86_64
pshufw mm1, [r10+8], 0
%else
pshufw mm1, [ebp+stack_offset+28], 0
ADS_END 2
%macro ADS_SSE2 1
-cglobal x264_pixel_ads4_%1, 4,7
+cglobal x264_pixel_ads4_%1, 4,7,12
movdqa xmm4, [r0]
pshuflw xmm7, xmm4, 0
pshuflw xmm6, xmm4, 0xAA
%endif ; ARCH
ADS_END 2
-cglobal x264_pixel_ads2_%1, 4,7
+cglobal x264_pixel_ads2_%1, 4,7,8
movq xmm6, [r0]
movd xmm5, r6m
pshuflw xmm7, xmm6, 0
movq [t0], xmm1
ADS_END 2
-cglobal x264_pixel_ads1_%1, 4,7
+cglobal x264_pixel_ads1_%1, 4,7,8
movd xmm7, [r0]
movd xmm6, r6m
pshuflw xmm7, xmm7, 0
; }
; return nmv;
; }
-cglobal x264_pixel_ads_mvs
+cglobal x264_pixel_ads_mvs, 0,7,0
ads_mvs:
- xor eax, eax
- xor esi, esi
%ifdef ARCH_X86_64
; mvs = r4
; masks = rsp
; width = r5
; clear last block in case width isn't divisible by 8. (assume divisible by 4, so clearing 4 bytes is enough.)
- mov dword [rsp+r5], 0
+%ifdef WIN64
+ mov r8, r4
+ mov r9, r5
+%endif
+ xor eax, eax
+ xor esi, esi
+ mov dword [rsp+r9], 0
jmp .loopi
.loopi0:
add esi, 8
- cmp esi, r5d
+ cmp esi, r9d
jge .end
.loopi:
mov rdi, [rsp+rsi]
jz .loopi0
xor ecx, ecx
%macro TEST 1
- mov [r4+rax*2], si
+ mov [r8+rax*2], si
test edi, 0xff<<(%1*8)
setne cl
add eax, ecx
TEST 1
TEST 2
TEST 3
- cmp esi, r5d
+ cmp esi, r9d
jl .loopi
.end:
mov rsp, r10
- ret
+ RET
%else
- ; no PROLOGUE, inherit from x264_pixel_ads1
+ xor eax, eax
+ xor esi, esi
mov ebx, [ebp+stack_offset+20] ; mvs
mov edi, [ebp+stack_offset+24] ; width
mov dword [esp+edi], 0
;-----------------------------------------------------------------------------
; void predict_8x8_vr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_vr_sse2, 2,2
+cglobal predict_8x8_vr_sse2, 2,2,7
movdqu xmm0, [r1+8]
movdqa xmm6, [pw_ff00 GLOBAL]
add r0, 4*FDEC_STRIDE
mova [r0+FDEC_STRIDE*n], m0
%assign n n+1
%endrep
- REP_RET
+ RET
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-cglobal predict_16x16_p_core_sse2, 1,2
+cglobal predict_16x16_p_core_sse2, 1,2,8
movd xmm0, r1m
movd xmm1, r2m
movd xmm2, r3m
punpcklqdq xmm0, xmm0
packuswb xmm0, xmm0
STORE16x16_SSE2 xmm0
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
-%macro QUANT_DC 2
-cglobal %1, 1,1
+%macro QUANT_DC 2-3 0
+cglobal %1, 1,1,%3
QUANT_DC_START
%if %2==1
QUANT_ONE [r0], m6, m7, 0
%endif
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse2, 2
+QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
QUANT_AC x264_quant_4x4_sse2, 2
QUANT_AC x264_quant_8x8_sse2, 8
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
-QUANT_DC x264_quant_4x4_dc_ssse3, 2
+QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
QUANT_AC x264_quant_4x4_ssse3, 2
QUANT_AC x264_quant_8x8_ssse3, 8
;Not faster on Conroe, so only used in SSE4 versions
%define QUANT_DC_START QUANT_DC_START_SSSE3
INIT_XMM
-QUANT_DC x264_quant_4x4_dc_sse4, 2
+QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
QUANT_AC x264_quant_4x4_sse4, 2
QUANT_AC x264_quant_8x8_sse4, 8
%macro DEQUANT16_L 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; m5 i_qbits
+;;; m2 i_qbits
mova m0, %2
packssdw m0, %3
pmullw m0, %1
- psllw m0, m5
+ psllw m0, m2
mova %1, m0
%endmacro
%macro DEQUANT32_R 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
-;;; m5 -i_qbits
-;;; m6 f
-;;; m7 0
+;;; m2 -i_qbits
+;;; m3 f
+;;; m4 0
mova m0, %1
mova m1, m0
- punpcklwd m0, m7
- punpckhwd m1, m7
+ punpcklwd m0, m4
+ punpckhwd m1, m4
pmaddwd m0, %2
pmaddwd m1, %3
- paddd m0, m6
- paddd m1, m6
- psrad m0, m5
- psrad m1, m5
+ paddd m0, m3
+ paddd m1, m3
+ psrad m0, m2
+ psrad m1, m2
packssdw m0, m1
mova %1, m0
%endmacro
%1 [r0+t0 ], [r1+t0*2 ], [r1+t0*2+ 8*%3]
sub t0d, 16*%3
jge %%loop
- rep ret
+ REP_RET
%else
%1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
%1 [r0 ], [r1 ], [r1+ 8*%3]
- ret
+ RET
%endif
%endmacro
-%macro DEQUANT16_FLAT 2-8
+%macro DEQUANT16_FLAT 2-5
mova m0, %1
%assign i %0-2
%rep %0-1
%else
pmullw m0, [r0+%2]
%endif
- psllw m %+ i, m7
+ psllw m %+ i, m4
mova [r0+%2], m %+ i
%assign i i-1
%rotate 1
%endrep
%endmacro
-%ifdef ARCH_X86_64
+%ifdef WIN64
+ DECLARE_REG_TMP 6,3,2
+%elifdef ARCH_X86_64
DECLARE_REG_TMP 4,3,2
%else
DECLARE_REG_TMP 2,0,1
%ifdef ARCH_X86_64
add r1, t2 ; dequant_mf[i_mf]
%else
- add r1, r1m ; dequant_mf[i_mf]
- mov r0, r0m ; dct
+ add r1, r1mp ; dequant_mf[i_mf]
+ mov r0, r0mp ; dct
%endif
sub t0d, %2
jl .rshift32 ; negative qbits => rightshift
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
+.skip_prologue:
DEQUANT_START %3+2, %3
.lshift:
- movd m5, t0d
+ movd m2, t0d
DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4
.rshift32:
neg t0d
- movd m5, t0d
- mova m6, [pd_1 GLOBAL]
- pxor m7, m7
- pslld m6, m5
- psrld m6, 1
+ movd m2, t0d
+ mova m3, [pd_1 GLOBAL]
+ pxor m4, m4
+ pslld m3, m2
+ psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4
cglobal x264_dequant_%2x%2_flat16_%1, 0,3
movifnidn t2d, r2m
%if %2 == 8
cmp t2d, 12
- jl x264_dequant_%2x%2_%1
+ jl x264_dequant_%2x%2_%1.skip_prologue
sub t2d, 12
%endif
imul t0d, t2d, 0x2b
%else
lea r1, [dequant%2_scale + t2 GLOBAL]
%endif
- movifnidn r0d, r0m
- movd m7, t0d
+ movifnidn r0, r0mp
+ movd m4, t0d
%if %2 == 4
%ifidn %1, mmx
DEQUANT16_FLAT [r1], 0, 16
DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
DEQUANT16_FLAT [r1+32], 32, 96
%endif
- ret
+ RET
%endmacro ; DEQUANT
%ifndef ARCH_X86_64
DEQUANT_START 6, 6
.lshift:
- movd m6, [r1]
- movd m5, t0d
- pslld m6, m5
+ movd m3, [r1]
+ movd m2, t0d
+ pslld m3, m2
%if mmsize==16
- pshuflw m6, m6, 0
- punpcklqdq m6, m6
+ pshuflw m3, m3, 0
+ punpcklqdq m3, m3
%else
- pshufw m6, m6, 0
+ pshufw m3, m3, 0
%endif
%assign x 0
%rep 16/mmsize
mova m0, [r0+mmsize*0+x]
mova m1, [r0+mmsize*1+x]
- pmullw m0, m6
- pmullw m1, m6
+ pmullw m0, m3
+ pmullw m1, m3
mova [r0+mmsize*0+x], m0
mova [r0+mmsize*1+x], m1
%assign x x+mmsize*2
.rshift32:
neg t0d
- movd m5, t0d
- mova m6, [pw_1 GLOBAL]
- mova m7, m6
- pslld m6, m5
- psrld m6, 1
- movd m4, [r1]
+ movd m3, t0d
+ mova m4, [pw_1 GLOBAL]
+ mova m5, m4
+ pslld m4, m3
+ psrld m4, 1
+ movd m2, [r1]
%if mmsize==8
- punpcklwd m4, m4
+ punpcklwd m2, m2
%else
- pshuflw m4, m4, 0
+ pshuflw m2, m2, 0
%endif
- punpcklwd m4, m6
+ punpcklwd m2, m4
%assign x 0
%rep 32/mmsize
mova m0, [r0+x]
mova m1, m0
- punpcklwd m0, m7
- punpckhwd m1, m7
- pmaddwd m0, m4
- pmaddwd m1, m4
- psrad m0, m5
- psrad m1, m5
+ punpcklwd m0, m5
+ punpckhwd m1, m5
+ pmaddwd m0, m2
+ pmaddwd m1, m2
+ psrad m0, m3
+ psrad m1, m3
packssdw m0, m1
mova [r0+x], m0
%assign x x+mmsize
;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 1
-cglobal x264_denoise_dct_%1, 4,5
+%macro DENOISE_DCT 1-2 0
+cglobal x264_denoise_dct_%1, 4,5,%2
movzx r4d, word [r0] ; backup DC coefficient
- pxor m7, m7
+ pxor m6, m6
.loop:
sub r3, mmsize
mova m2, [r0+r3*2+0*mmsize]
mova [r0+r3*2+1*mmsize], m1
mova m2, m4
mova m3, m5
- punpcklwd m4, m7
- punpckhwd m2, m7
- punpcklwd m5, m7
- punpckhwd m3, m7
+ punpcklwd m4, m6
+ punpckhwd m2, m6
+ punpcklwd m5, m6
+ punpckhwd m3, m6
paddd m4, [r1+r3*4+0*mmsize]
paddd m2, [r1+r3*4+1*mmsize]
paddd m5, [r1+r3*4+2*mmsize]
DENOISE_DCT mmx
%endif
INIT_XMM
-DENOISE_DCT sse2
+DENOISE_DCT sse2, 7
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
-DENOISE_DCT ssse3
+DENOISE_DCT ssse3, 7
%else
%define table x264_decimate_table8
%endif
- mova m7, [pb_1 GLOBAL]
- DECIMATE_MASK r1d, eax, r0, m7, %1, null
+ mova m5, [pb_1 GLOBAL]
+ DECIMATE_MASK r1d, eax, r0, m5, %1, null
test eax, eax
jne .ret9
- DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
+ DECIMATE_MASK r2d, eax, r0+32, m5, %1, null
shl r2d, 16
or r1d, r2d
- DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
+ DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null
shl r2, 32
or eax, r3d
or r1, r2
- DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
+ DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null
shl r2, 48
or r1, r2
xor r1, -1
RET
%else
cglobal x264_coeff_last4_%1, 0,3
- mov edx, r0m
+ mov edx, r0mp
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
%endmacro
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
-%ifdef ARCH_X86_64
+%ifdef WIN64
+ DECLARE_REG_TMP 3,1,2,0,4,5,6
+%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
%macro COEFF_LEVELRUN 2
cglobal x264_coeff_level_run%2_%1,0,7
- movifnidn t0d, r0m
- movifnidn t1d, r1m
+ movifnidn t0, r0mp
+ movifnidn t1, r1mp
pxor m2, m2
LAST_MASK t5d, t0-(%2&1)*2, t4d
not t5d
;-----------------------------------------------------------------------------
; int x264_pixel_sad_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_sad_16x16_%1, 4,4
+cglobal x264_pixel_sad_16x16_%1, 4,4,8
movdqu m0, [r2]
movdqu m1, [r2+r3]
lea r2, [r2+2*r3]
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
-%macro INTRA_SAD16 1
-cglobal x264_intra_sad_x3_16x16_%1,3,5
+%macro INTRA_SAD16 1-2 0
+cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
%define SPLATB SPLATB_MMX
INTRA_SAD16 mmxext
INIT_XMM
-INTRA_SAD16 sse2
+INTRA_SAD16 sse2, 8
%define SPLATB SPLATB_SSSE3
-INTRA_SAD16 ssse3
+INTRA_SAD16 ssse3, 8
%endmacro
%macro SAD_X3_END 0
-%ifdef ARCH_X86_64
+%ifdef UNIX64
movd [r5+0], mm0
movd [r5+4], mm1
movd [r5+8], mm2
%else
- mov r0, r5m
+ mov r0, r5mp
movd [r0+0], mm0
movd [r0+4], mm1
movd [r0+8], mm2
%endmacro
%macro SAD_X4_END 0
- mov r0, r6m
+ mov r0, r6mp
movd [r0+0], mm0
movd [r0+4], mm1
movd [r0+8], mm2
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal x264_pixel_sad_x%1_%2x%3_mmxext, %1+2, %1+2
+%ifdef WIN64
+ %assign i %1+1
+ movsxd r %+ i, r %+ i %+ d
+%endif
SAD_X%1_2x%2P 1
%rep %3/2-1
SAD_X%1_2x%2P 0
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
-%ifdef ARCH_X86_64
+%ifdef UNIX64
movd [r5+0], xmm0
movd [r5+4], xmm1
movd [r5+8], xmm2
%else
- mov r0, r5m
+ mov r0, r5mp
movd [r0+0], xmm0
movd [r0+4], xmm1
movd [r0+8], xmm2
%endmacro
%macro SAD_X4_END_SSE2 0
- mov r0, r6m
+ mov r0, r6mp
psllq xmm1, 32
psllq xmm3, 32
paddw xmm0, xmm1
; uint8_t *pix2, int i_stride, int scores[3] )
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
+cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1,9
+%ifdef WIN64
+ %assign i %1+1
+ movsxd r %+ i, r %+ i %+ d
+%endif
SAD_X%1_2x%2P_SSE2 1
%rep %3/2-1
SAD_X%1_2x%2P_SSE2 0
%endmacro
%macro SAD_X_SSE2_MISALIGN 4
-cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1
+cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1,9
+%ifdef WIN64
+ %assign i %1+1
+ movsxd r %+ i, r %+ i %+ d
+%endif
SAD_X%1_2x%2P_SSE2_MISALIGN 1
%rep %3/2-1
SAD_X%1_2x%2P_SSE2_MISALIGN 0
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
-cglobal x264_pixel_sad_16x%2_cache64_%1, 0,0
+cglobal x264_pixel_sad_16x%2_cache64_%1
mov eax, r2m
and eax, 0x37
cmp eax, 0x30
%endmacro
%macro SAD16_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_16x%1_cache%2_mmxext, 0,0
+cglobal x264_pixel_sad_16x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 16, %1, %1, %2
.loop:
movq mm1, [r2]
%endmacro
%macro SAD8_CACHELINE_FUNC_MMX2 2 ; height, cacheline
-cglobal x264_pixel_sad_8x%1_cache%2_mmxext, 0,0
+cglobal x264_pixel_sad_8x%1_cache%2_mmxext
SAD_CACHELINE_START_MMX2 8, %1, %1/2, %2
.loop:
movq mm1, [r2+8]
%endmacro
%macro SADX3_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5, 0,0
+cglobal x264_pixel_sad_x3_%1x%2_cache%3_%5
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
jmp x264_pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
+ PROLOGUE 6,7
+%ifdef WIN64
+ movsxd r4, r4d
+ sub rsp, 8
+%endif
push r3
push r2
mov r2, r1
mov r11, r5
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
+%ifdef WIN64
+ mov r2, [rsp]
+%else
pop r2
+%endif
mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
+%ifdef WIN64
+ mov r2, [rsp+8]
+%else
pop r2
+%endif
mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
+%ifdef WIN64
+ add rsp, 24
+%endif
+ RET
%else
push edi
mov edi, [esp+28]
mov [edi+8], eax
add esp, 16
pop edi
-%endif
ret
+%endif
%endmacro
%macro SADX4_CACHELINE_FUNC 5 ; width, height, cacheline, normal_ver, split_ver
-cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5, 0,0
+cglobal x264_pixel_sad_x4_%1x%2_cache%3_%5
CHECK_SPLIT r1m, %1, %3
CHECK_SPLIT r2m, %1, %3
CHECK_SPLIT r3m, %1, %3
jmp x264_pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
- mov r11, r6m
+ PROLOGUE 6,7
+ mov r11, r6mp
+%ifdef WIN64
+ movsxd r5, r5d
+%endif
push r4
push r3
push r2
mov r10, r0
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11], eax
+%ifdef WIN64
+ mov r2, [rsp]
+%else
pop r2
+%endif
mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+4], eax
+%ifdef WIN64
+ mov r2, [rsp+8]
+%else
pop r2
+%endif
mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+8], eax
+%ifdef WIN64
+ mov r2, [rsp+16]
+%else
pop r2
+%endif
mov r0, r10
call x264_pixel_sad_%1x%2_cache%3_%5
mov [r11+12], eax
+%ifdef WIN64
+ add rsp, 24
+%endif
+ RET
%else
push edi
mov edi, [esp+32]
mov [edi+12], eax
add esp, 16
pop edi
-%endif
ret
+%endif
%endmacro
%macro SADX34_CACHELINE_FUNC 5
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
+%ifdef ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64
+ %else
+ %define UNIX64
+ %endif
+%endif
+
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
-%ifndef ARCH_X86_64
+%ifdef WIN64
+ %define PIC
+%elifndef ARCH_X86_64
%undef PIC
%endif
%ifdef PIC
; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
-; %3 = list of names to define to registers
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
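+; e.g. "mov r3, r4mp" loads argument 4 at native size whether it was passed
+; in a register or on the stack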
+
%macro DECLARE_REG 6
%define r%1q %2
%define r%1d %3
%define r%1w %4
%define r%1b %5
%define r%1m %6
+ %ifid %6 ; i.e. it's a register
+ %define r%1mp %2
+ %elifdef ARCH_X86_64 ; memory
+ %define r%1mp qword %6
+ %else
+ %define r%1mp dword %6
+ %endif
%define r%1 %2
%endmacro
%assign n_arg_names %%i
%endmacro
-%ifdef ARCH_X86_64 ;========================================================
+%ifdef WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx, ecx, cx, cl, ecx
+DECLARE_REG 1, rdx, edx, dx, dl, edx
+DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
+DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
+DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
+DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
+DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
+%define r7m [rsp + stack_offset + 64]
+%define r8m [rsp + stack_offset + 72]
+
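+; The Win64 caller always reserves 32 bytes of shadow space for the four
+; register arguments just above the return address, so stack argument N
+; (N >= 4) sits at [rsp + stack_offset + 8 + N*8]; that is where the r4m..r8m
+; offsets above and the address computed in LOAD_IF_USED below come from.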
+%macro LOAD_IF_USED 2 ; reg_id, number_of_args
+ %if %1 < %2
+ mov r%1, [rsp + stack_offset + 8 + %1*8]
+ %endif
+%endmacro
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ ASSERT %2 >= %1
+ %assign regs_used %2
+ ASSERT regs_used <= 7
+ %if %0 > 2
+ %assign xmm_regs_used %3
+ %else
+ %assign xmm_regs_used 0
+ %endif
+ ASSERT xmm_regs_used <= 16
+ %if regs_used > 4
+ push r4
+ push r5
+ %assign stack_offset stack_offset+16
+ %endif
+ %if xmm_regs_used > 6
+ sub rsp, (xmm_regs_used-6)*16+16
+ %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+ %endrep
+ %endif
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro RESTORE_XMM_INTERNAL 1
+ %if xmm_regs_used > 6
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+ %endrep
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+%endmacro
+
+%macro RESTORE_XMM 1
+ RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
+ %assign xmm_regs_used 0
+%endmacro
+
+%macro RET 0
+ RESTORE_XMM_INTERNAL rsp
+ %if regs_used > 4
+ pop r5
+ pop r4
+ %endif
+ ret
+%endmacro
+
+%macro REP_RET 0
+ %if regs_used > 4 || xmm_regs_used > 6
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi
%endif
%endmacro
-%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
ASSERT %2 >= %1
ASSERT %2 <= 7
- %assign stack_offset 0
LOAD_IF_USED 6, %1
- DEFINE_ARGS %3
+ DEFINE_ARGS %4
%endmacro
%macro RET 0
%endif
%endmacro
-%macro PROLOGUE 2-3+ ; #args, #regs, arg_names...
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs (ignored), arg_names...
ASSERT %2 >= %1
- %assign stack_offset 0
%assign regs_used %2
ASSERT regs_used <= 7
PUSH_IF_USED 3
LOAD_IF_USED 4, %1
LOAD_IF_USED 5, %1
LOAD_IF_USED 6, %1
- DEFINE_ARGS %3
+ DEFINE_ARGS %4
%endmacro
%macro RET 0
; Symbol prefix for C linkage
%macro cglobal 1-2+
+ %ifdef PREFIX
+ %xdefine %1.skip_prologue _%1.skip_prologue
+ %xdefine %1 _%1
+ %endif
%ifidn __OUTPUT_FORMAT__,elf
- %ifdef PREFIX
- global _%1:function hidden
- %define %1 _%1
- %else
- global %1:function hidden
- %endif
+ global %1:function hidden
%else
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
+ global %1
%endif
align function_align
%1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %assign stack_offset 0
%if %0 > 1
PROLOGUE %2
%endif
%macro cextern 1
%ifdef PREFIX
- extern _%1
- %define %1 _%1
- %else
- extern %1
+ %xdefine %1 _%1
%endif
+ extern %1
%endmacro
; This is needed for ELF, otherwise the GNU linker assumes the stack is
echo " --extra-cflags=ECFLAGS add ECFLAGS to CFLAGS"
echo " --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
echo " --host=HOST build programs to run on HOST"
+echo " --cross-prefix=PREFIX use PREFIX for compilation tools"
echo ""
exit 1
fi
vis="no"
shared="no"
-CC="${CC-gcc}"
CFLAGS="$CFLAGS -Wall -I."
LDFLAGS="$LDFLAGS"
+ASFLAGS="$ASFLAGS"
HAVE_GETOPT_LONG=1
-
-AS=""
-ASFLAGS=""
+cross_prefix=""
EXE=""
--host=*)
host="${opt#--host=}"
;;
+ --cross-prefix=*)
+ cross_prefix="${opt#--cross-prefix=}"
+ ;;
*)
echo "Unknown option $opt, ignored"
;;
esac
done
+CC="${CC-${cross_prefix}gcc}"
+AR="${AR-${cross_prefix}ar}"
+RANLIB="${RANLIB-${cross_prefix}ranlib}"
+STRIP="${STRIP-${cross_prefix}strip}"
+AS=""
+
if [ "x$host" = x ]; then
host=`./config.guess`
fi
ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX"
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
+ elif [ "$SYS" = MINGW ]; then
+ ASFLAGS="$ASFLAGS -f win32 -m amd64 -DPREFIX"
else
- ASFLAGS="-f elf -m amd64"
+ ASFLAGS="$ASFLAGS -f elf -m amd64"
fi
;;
powerpc|powerpc64)
ARCH="UltraSparc"
CFLAGS="$CFLAGS -mcpu=ultrasparc"
LDFLAGS="$LDFLAGS -mcpu=ultrasparc"
- AS="as"
+ AS="${cross_prefix}as"
ASFLAGS="$ASFLAGS -xarch=v8plusa"
else
ARCH="Sparc"
pthread="yes"
libpthread="-lpthreadGC2 -lwsock32"
CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
+ elif cc_check pthread.h "-lpthreadGC2 -lws2_32 -DPTW32_STATIC_LIB" "pthread_create(0,0,0,0);" ; then
+ pthread="yes"
+ libpthread="-lpthreadGC2 -lws2_32"
+ CFLAGS="$CFLAGS -DPTW32_STATIC_LIB"
fi
;;
OPENBSD)
fi
fi
if [ "$avis_input" = "yes" ] ; then
- echo "#define AVIS_INPUT" >> config.h
- LDFLAGS="$LDFLAGS -lvfw32"
+ if cc_check "stdlib.h" -lvfw32 ; then
+ echo "#define AVIS_INPUT" >> config.h
+ LDFLAGS="$LDFLAGS -lvfw32"
+ elif cc_check "stdlib.h" -lavifil32 ; then
+ echo "#define AVIS_INPUT" >> config.h
+ LDFLAGS="$LDFLAGS -lavifil32"
+ else
+ avis_input="no";
+ fi
fi
if [ "$pic" = "yes" ] ; then
CFLAGS=$CFLAGS
ALTIVECFLAGS=$ALTIVECFLAGS
LDFLAGS=$LDFLAGS
+AR=$AR
+RANLIB=$RANLIB
+STRIP=$STRIP
AS=$AS
ASFLAGS=$ASFLAGS
EXE=$EXE
int
getopt_long (argc, argv, optstring, long_options, opt_index)
+ int argc;
+ char *const *argv;
+ const char *optstring;
+ const struct option *long_options;
+ int *opt_index;
{
return _getopt_internal (argc, argv, optstring, long_options, opt_index, 0);
}
SECTION_RODATA
-error_message: db "failed to preserve register", 10, 0
+error_message: db "failed to preserve register", 0
+
+%ifdef WIN64
+; just random numbers to reduce the chance of incidental match
+ALIGN 16
+n4: dq 0xa77809bf11b239d1
+n5: dq 0x2ba9bf3d2f05b389
+x6: ddq 0x79445c159ce790641a1b2550a612b48c
+x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
+x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943
+x9: ddq 0xd229e1f5b281303facbd382dcf5b8de2
+x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9
+x11: ddq 0x77d410d5c42c882d89b0c0765892729a
+x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5
+x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
+x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
+x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
+%endif
SECTION .text
-cextern printf
+cextern puts
; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
%define max_args 11
+%ifdef WIN64
+
+;-----------------------------------------------------------------------------
+; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
+;-----------------------------------------------------------------------------
+cglobal x264_checkasm_call, 4,7,16
+ sub rsp, max_args*8
+ %assign stack_offset stack_offset+max_args*8
+ mov r6, r0
+ mov [rsp+stack_offset+16], r1
+ mov r0, r2
+ mov r1, r3
+ mov r2d, r4m ; FIXME truncates pointer
+ mov r3d, r5m ; FIXME truncates pointer
+%assign i 4
+%rep max_args-4
+ mov r4, [rsp+stack_offset+8+(i+2)*8]
+ mov [rsp+i*8], r4
+ %assign i i+1
+%endrep
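+    ; fill the callee-saved registers (r4/r5, i.e. rdi/rsi, and xmm6-15 on
+    ; Win64) with the sentinel values above so the checks after the call can
+    ; detect whether the function failed to preserve them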
+%assign i 6
+%rep 16-6
+ movdqa xmm %+ i, [x %+ i GLOBAL]
+ %assign i i+1
+%endrep
+ mov r4, [n4 GLOBAL]
+ mov r5, [n5 GLOBAL]
+ call r6
+ xor r4, [n4 GLOBAL]
+ xor r5, [n5 GLOBAL]
+ or r4, r5
+ pxor xmm5, xmm5
+%assign i 6
+%rep 16-6
+ pxor xmm %+ i, [x %+ i GLOBAL]
+ por xmm5, xmm %+ i
+ %assign i i+1
+%endrep
+ packsswb xmm5, xmm5
+ movq r5, xmm5
+ or r4, r5
+ jz .ok
+ mov r4, rax
+ lea r0, [error_message GLOBAL]
+ call puts
+ mov r1, [rsp+stack_offset+16]
+ mov dword [r1], 0
+ mov rax, r4
+.ok:
+ add rsp, max_args*8
+ %assign stack_offset stack_offset-max_args*8
+ RET
+
+%elifndef ARCH_X86_64
+
; just random numbers to reduce the chance of incidental match
%define n3 dword 0x6549315c
%define n4 dword 0xe02f3e23
%define n5 dword 0xb78d0d1d
%define n6 dword 0x33627ba7
-%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
-; long x264_checkasm_call( long (*func)(), int *ok, ... )
+; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
cglobal x264_checkasm_call, 1,7
mov r3, n3
mov r3, eax
lea r1, [error_message GLOBAL]
push r1
- xor eax, eax
- call printf
+ call puts
add esp, 4
mov r1, r1m
mov dword [r1], 0
mov eax, r3
.ok:
RET
+
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
#define call_c1(func,...) func(__VA_ARGS__)
-#ifdef ARCH_X86
+#if defined(ARCH_X86) || defined(_WIN64)
/* detect when callee-saved regs aren't saved.
* needs an explicit asm check because it only sometimes crashes in normal use. */
-long x264_checkasm_call( long (*func)(), int *ok, ... );
-#define call_a1(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__)
+intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... );
+#define call_a1(func,...) x264_checkasm_call((intptr_t(*)())func, &ok, __VA_ARGS__)
#else
#define call_a1 call_c1
#endif
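
For illustration, a self-contained sketch of how a test uses these wrappers (the function names are placeholders, and call_a1 is stubbed to call_c1 here because the real x264_checkasm_call needs the asm object from checkasm-a.asm; in the harness, a cleared `ok` after call_a1 signals a clobbered callee-saved register):

    #include <stdio.h>
    #include <stdint.h>

    static int sum16_c( const uint8_t *p )
    {
        int s = 0;
        for( int i = 0; i < 16; i++ )
            s += p[i];
        return s;
    }
    /* stand-in for the asm implementation under test */
    static int sum16_asm( const uint8_t *p ) { return sum16_c( p ); }

    #define call_c1(func,...) func(__VA_ARGS__)
    #define call_a1 call_c1

    int main( void )
    {
        uint8_t buf[16] = { 1, 2, 3, 4 };
        int ok = 1, res_c, res_a;
        res_c = call_c1( sum16_c, buf );
        res_a = call_a1( sum16_asm, buf );
        printf( "sum16: %s\n", ( ok && res_c == res_a ) ? "ok" : "FAILED" );
        return 0;
    }
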