; JDIMENSION output_row, int num_rows);
;
-; r10 = JDIMENSION img_width
+; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 8
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 5
push rbx
mov ecx, r10d
.return:
pop rbx
- uncollect_args
+ uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; JDIMENSION output_row, int num_rows);
;
-; r10 = JDIMENSION img_width
+; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 5
push rbx
mov ecx, r10d
.return:
pop rbx
- uncollect_args
+ uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = working_state *state
; r11 = JOCTET *buffer
; r12 = JCOEFPTR block
-; r13 = int last_dc_val
+; r13d = int last_dc_val
; r14 = c_derived_tbl *dctbl
; r15 = c_derived_tbl *actbl
mov [rsp], rax
mov rbp,rsp ; rbp = aligned rbp
lea rsp, [t2]
- collect_args
+ collect_args 6
%ifdef WIN64
movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
add rsp, 4*SIZEOF_XMMWORD
%endif
- uncollect_args
+ uncollect_args 6
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; JSAMPARRAY input_data, JSAMPARRAY output_data);
;
-; r10 = JDIMENSION image_width
+; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 6
mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jg near .rowloop
.return:
- uncollect_args
+ uncollect_args 6
pop rbp
ret
; JSAMPARRAY input_data, JSAMPARRAY output_data);
;
-; r10 = JDIMENSION image_width
+; r10d = JDIMENSION image_width
; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 6
mov ecx, r13d
shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
jg near .rowloop
.return:
- uncollect_args
+ uncollect_args 6
pop rbp
ret
; JSAMPARRAY output_buf, int num_rows)
;
-; r10 = JDIMENSION out_width
+; r10d = JDIMENSION out_width
; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
+; r12d = JDIMENSION input_row
; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
+; r14d = int num_rows
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM 2
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 5
push rbx
mov ecx, r10d ; num_cols
.return:
pop rbx
- uncollect_args
+ uncollect_args 5
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; JSAMPARRAY output_buf);
;
-; r10 = JDIMENSION output_width
+; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
+; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 4
push rbx
mov ecx, r10d ; col
.return:
pop rbx
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; JSAMPARRAY output_buf);
;
-; r10 = JDIMENSION output_width
+; r10d = JDIMENSION output_width
; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
+; r12d = JDIMENSION in_row_group_ctr
; r13 = JSAMPARRAY output_buf
align 16
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 4
push rbx
mov eax, r10d
pop rdx
pop rbx
- uncollect_args
+ uncollect_args 4
pop rbp
ret
;
; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
+; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 4
mov eax, r11d ; colctr
test rax, rax
jg near .rowloop
.return:
- uncollect_args
+ uncollect_args 4
pop rbp
ret
;
; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
+; r11d = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 4
push rbx
mov eax, r11d ; colctr
.return:
pop rbx
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
;
; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
+; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 4
mov edx, r11d
add rdx, byte (2*SIZEOF_XMMWORD)-1
jg short .rowloop
.return:
- uncollect_args
+ uncollect_args 4
pop rbp
ret
;
; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
+; r11d = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY *output_data_ptr
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 4
push rbx
mov edx, r11d
.return:
pop rbx
- uncollect_args
+ uncollect_args 4
pop rbp
ret
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 1
; ---- Pass 1: process rows.
dec rcx
jnz near .columnloop
- uncollect_args
+ uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 1
; ---- Pass 1: process rows.
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
- uncollect_args
+ uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 1
; ---- Pass 1: process rows.
movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
- uncollect_args
+ uncollect_args 1
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
+; r13d = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [workspace]
- collect_args
+ collect_args 4
push rbx
; ---- Pass 1: process columns from input, store into work array.
jnz near .rowloop
pop rbx
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = jpeg_component_info *compptr
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
+; r13d = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 4
; ---- Pass 1: process columns from input.
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = jpeg_component_info *compptr
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
+; r13d = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 4
; ---- Pass 1: process columns from input.
movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
+; r13d = JDIMENSION output_col
%define original_rbp rbp+0
%define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
mov [rsp], rax
mov rbp, rsp ; rbp = aligned rbp
lea rsp, [wk(0)]
- collect_args
+ collect_args 4
; ---- Pass 1: process columns from input.
movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
- uncollect_args
+ uncollect_args 4
mov rsp, rbp ; rsp <- aligned rbp
pop rsp ; rsp <- original rbp
pop rbp
; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
+; r13d = JDIMENSION output_col
align 16
global EXTN(jsimd_idct_2x2_sse2)
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 4
push rbx
; ---- Pass 1: process columns from input.
mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx
pop rbx
- uncollect_args
+ uncollect_args 4
pop rbp
ret
;
; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
+; r11d = JDIMENSION start_col
; r12 = FAST_FLOAT *workspace
align 16
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 3
push rbx
pcmpeqw xmm7, xmm7
jnz short .convloop
pop rbx
- uncollect_args
+ uncollect_args 3
pop rbp
ret
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 3
mov rsi, r12
mov rdx, r11
dec rax
jnz short .quantloop
- uncollect_args
+ uncollect_args 3
pop rbp
ret
;
; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
+; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace
align 16
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 3
push rbx
pxor xmm6, xmm6 ; xmm6=(all 0's)
jnz short .convloop
pop rbx
- uncollect_args
+ uncollect_args 3
pop rbp
ret
push rbp
mov rax, rsp
mov rbp, rsp
- collect_args
+ collect_args 3
mov rsi, r12
mov rdx, r11
dec rax
jnz near .quantloop
- uncollect_args
+ uncollect_args 3
pop rbp
ret
%ifdef WIN64
; --------------------------------------------------------------------------
; collect_args N   (WIN64 variant)
;
; This hunk changes the macro from taking 0 parameters to taking 1.
; N (%1) is compared against 1..5 to decide how many argument registers to
; set up.  After the change the macro:
;   * first stores xmm6 and xmm7 into two SIZEOF_XMMWORD-sized stack slots
;     (before the change this was the last step, after all the pushes);
;   * always copies rcx into r10, and copies rdx into r11 when N > 1;
;   * when N > 2 / 3 / 4 / 5, pushes r12 / r13 / r14 / r15 and then loads
;     that register from r8 / r9 / [rax+48] / [rax+56] respectively, so only
;     the registers that get overwritten are pushed (before the change
;     r12-r15 were pushed unconditionally);
;   * finally pushes rsi and rdi.
;
; rax is read but never written by this macro, so the invoking code must
; have set it up beforehand.  Some call sites in this diff do
; `mov rax, rsp` immediately after `push rbp`.
; NOTE(review): +48/+56 presumably step over the saved rbp, the return
; address and the 32-byte shadow space to reach stack arguments 5 and 6 --
; confirm against every call site's rax setup.
; The matching uncollect_args pops under the same N conditions, so both
; macros should be invoked with the same N.
; --------------------------------------------------------------------------
-%imacro collect_args 0
- push r12
- push r13
- push r14
- push r15
+%imacro collect_args 1
+ sub rsp, SIZEOF_XMMWORD ; reserve a stack slot for xmm6
+ movaps XMMWORD [rsp], xmm6 ; aligned store: assumes rsp is suitably aligned here -- TODO confirm
+ sub rsp, SIZEOF_XMMWORD ; reserve a stack slot for xmm7
+ movaps XMMWORD [rsp], xmm7
mov r10, rcx ; 1st argument, copied for every N
+%if %1 > 1
mov r11, rdx ; 2nd argument (r11 is not pushed in this variant)
+%endif
+%if %1 > 2
+ push r12
mov r12, r8 ; 3rd argument
+%endif
+%if %1 > 3
+ push r13
mov r13, r9 ; 4th argument
+%endif
+%if %1 > 4
+ push r14
mov r14, [rax+48] ; 5th argument, read from memory relative to rax
+%endif
+%if %1 > 5
+ push r15
mov r15, [rax+56] ; 6th argument, read from memory relative to rax
+%endif
push rsi ; rsi/rdi are pushed for every N
push rdi
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm6
- sub rsp, SIZEOF_XMMWORD
- movaps XMMWORD [rsp], xmm7
%endmacro
; --------------------------------------------------------------------------
; uncollect_args N   (WIN64 variant)
;
; This hunk changes the macro from taking 0 parameters to taking 1.
; It undoes the stack changes of the WIN64 collect_args in reverse order:
;   * pops rdi, then rsi (for every N);
;   * pops r15 / r14 / r13 / r12 only when N > 5 / 4 / 3 / 2, mirroring the
;     conditional pushes in collect_args;
;   * finally reloads xmm7 and xmm6 from the stack and releases their two
;     SIZEOF_XMMWORD-sized slots (before the change this was the first step).
;
; N must equal the value given to the matching collect_args; otherwise the
; number of pops would not match the number of pushes.
; rsp must point at the same stack position collect_args left it at.
; --------------------------------------------------------------------------
-%imacro uncollect_args 0
- movaps xmm7, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
- movaps xmm6, XMMWORD [rsp]
- add rsp, SIZEOF_XMMWORD
+%imacro uncollect_args 1
pop rdi ; pushed last by collect_args, so popped first
pop rsi
+%if %1 > 5
pop r15
+%endif
+%if %1 > 4
pop r14
+%endif
+%if %1 > 3
pop r13
+%endif
+%if %1 > 2
pop r12
+%endif
+ movaps xmm7, XMMWORD [rsp] ; xmm7 was stored second, so it is reloaded first
+ add rsp, SIZEOF_XMMWORD ; release the xmm7 slot
+ movaps xmm6, XMMWORD [rsp]
+ add rsp, SIZEOF_XMMWORD ; release the xmm6 slot
%endmacro
%else
; --------------------------------------------------------------------------
; collect_args N   (non-WIN64 variant)
;
; This hunk changes the macro from taking 0 parameters to taking 1.
; N (%1) is compared against 1..5 to decide how many argument registers to
; set up.  After the change the macro:
;   * always pushes r10 and copies rdi into it;
;   * when N > 1 / 2 / 3 / 4 / 5, pushes r11 / r12 / r13 / r14 / r15 and
;     copies rsi / rdx / rcx / r8 / r9 into it.
; Before the change r10-r15 were all pushed regardless of how many
; arguments the function used.
;
; The source registers rdi, rsi, rdx, rcx, r8, r9 are in the System V AMD64
; integer-argument order, so r10..r15 end up holding arguments 1..6.
; Each copy is a full 64-bit move.
; The matching uncollect_args pops under the same N conditions, so both
; macros should be invoked with the same N.
; --------------------------------------------------------------------------
-%imacro collect_args 0
+%imacro collect_args 1
push r10 ; r10 is saved for every N
- push r11
- push r12
- push r13
- push r14
- push r15
mov r10, rdi ; 1st argument
+%if %1 > 1
+ push r11
mov r11, rsi ; 2nd argument
+%endif
+%if %1 > 2
+ push r12
mov r12, rdx ; 3rd argument
+%endif
+%if %1 > 3
+ push r13
mov r13, rcx ; 4th argument
+%endif
+%if %1 > 4
+ push r14
mov r14, r8 ; 5th argument
+%endif
+%if %1 > 5
+ push r15
mov r15, r9 ; 6th argument
+%endif
%endmacro
; --------------------------------------------------------------------------
; uncollect_args N   (non-WIN64 variant)
;
; This hunk changes the macro from taking 0 parameters to taking 1.
; It restores the registers saved by the non-WIN64 collect_args, popping in
; the reverse of the push order: r15 / r14 / r13 / r12 / r11 only when
; N > 5 / 4 / 3 / 2 / 1, then r10 for every N.
;
; N must equal the value given to the matching collect_args; otherwise the
; number of pops would not match the number of pushes.
; rsp must point at the same stack position collect_args left it at.
; --------------------------------------------------------------------------
-%imacro uncollect_args 0
+%imacro uncollect_args 1
+%if %1 > 5
pop r15
+%endif
+%if %1 > 4
pop r14
+%endif
+%if %1 > 3
pop r13
+%endif
+%if %1 > 2
pop r12
+%endif
+%if %1 > 1
pop r11
+%endif
pop r10 ; pushed first by collect_args, so popped last
%endmacro