From 16b121890c37f0054a662949cd17af6baef993f9 Mon Sep 17 00:00:00 2001 From: DRC Date: Sun, 29 May 2016 10:51:16 -0500 Subject: [PATCH] x86-64 SIMD: Optimize argument collection Expand collect_args/uncollect_args macros so that the number of arguments can be specified. This prevents unnecessary push and mov instructions. NOTE: On Windows, the push/pop of xmm6 and xmm7 had to be moved to the other end of the macro to ensure that rsp is aligned on a 16-byte boundary. --- simd/jccolext-sse2-64.asm | 10 ++--- simd/jcgryext-sse2-64.asm | 10 ++--- simd/jchuff-sse2-64.asm | 6 +-- simd/jcsample-sse2-64.asm | 20 +++++----- simd/jdcolext-sse2-64.asm | 10 ++--- simd/jdmrgext-sse2-64.asm | 16 ++++---- simd/jdsample-sse2-64.asm | 24 ++++++------ simd/jfdctflt-sse-64.asm | 4 +- simd/jfdctfst-sse2-64.asm | 4 +- simd/jfdctint-sse2-64.asm | 4 +- simd/jidctflt-sse2-64.asm | 6 +-- simd/jidctfst-sse2-64.asm | 6 +-- simd/jidctint-sse2-64.asm | 6 +-- simd/jidctred-sse2-64.asm | 12 +++--- simd/jquantf-sse2-64.asm | 10 ++--- simd/jquanti-sse2-64.asm | 10 ++--- simd/jsimdext.inc | 80 +++++++++++++++++++++++++++++---------- 17 files changed, 138 insertions(+), 100 deletions(-) diff --git a/simd/jccolext-sse2-64.asm b/simd/jccolext-sse2-64.asm index bf60459..258dfad 100644 --- a/simd/jccolext-sse2-64.asm +++ b/simd/jccolext-sse2-64.asm @@ -27,11 +27,11 @@ ; JDIMENSION output_row, int num_rows); ; -; r10 = JDIMENSION img_width +; r10d = JDIMENSION img_width ; r11 = JSAMPARRAY input_buf ; r12 = JSAMPIMAGE output_buf -; r13 = JDIMENSION output_row -; r14 = int num_rows +; r13d = JDIMENSION output_row +; r14d = int num_rows %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 8 @@ -48,7 +48,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 5 push rbx mov ecx, r10d @@ -475,7 +475,7 @@ EXTN(jsimd_rgb_ycc_convert_sse2): .return: pop rbx - uncollect_args + uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jcgryext-sse2-64.asm b/simd/jcgryext-sse2-64.asm index 504e295..663104e 100644 --- a/simd/jcgryext-sse2-64.asm +++ b/simd/jcgryext-sse2-64.asm @@ -27,11 +27,11 @@ ; JDIMENSION output_row, int num_rows); ; -; r10 = JDIMENSION img_width +; r10d = JDIMENSION img_width ; r11 = JSAMPARRAY input_buf ; r12 = JSAMPIMAGE output_buf -; r13 = JDIMENSION output_row -; r14 = int num_rows +; r13d = JDIMENSION output_row +; r14d = int num_rows %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 @@ -48,7 +48,7 @@ EXTN(jsimd_rgb_gray_convert_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 5 push rbx mov ecx, r10d @@ -354,7 +354,7 @@ EXTN(jsimd_rgb_gray_convert_sse2): .return: pop rbx - uncollect_args + uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jchuff-sse2-64.asm b/simd/jchuff-sse2-64.asm index 701cbc2..486fd80 100644 --- a/simd/jchuff-sse2-64.asm +++ b/simd/jchuff-sse2-64.asm @@ -172,7 +172,7 @@ EXTN(jconst_huff_encode_one_block): ; r10 = working_state *state ; r11 = JOCTET *buffer ; r12 = JCOEFPTR block -; r13 = int last_dc_val +; r13d = int last_dc_val ; r14 = c_derived_tbl *dctbl ; r15 = c_derived_tbl *actbl @@ -193,7 +193,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): mov [rsp], rax mov rbp,rsp ; rbp = aligned rbp lea rsp, [t2] - collect_args + collect_args 6 %ifdef WIN64 movaps XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8 movaps XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9 @@ -349,7 +349,7 @@ EXTN(jsimd_huff_encode_one_block_sse2): movaps xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD] add rsp, 4*SIZEOF_XMMWORD %endif - uncollect_args + uncollect_args 6 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jcsample-sse2-64.asm b/simd/jcsample-sse2-64.asm index f6b2aa7..29fc982 100644 --- a/simd/jcsample-sse2-64.asm +++ b/simd/jcsample-sse2-64.asm @@ -32,10 +32,10 @@ ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -; r10 = JDIMENSION image_width +; r10d = JDIMENSION image_width ; r11 = int max_v_samp_factor -; r12 = JDIMENSION v_samp_factor -; r13 = JDIMENSION width_blocks +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_blocks ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data @@ -46,7 +46,7 @@ EXTN(jsimd_h2v1_downsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -160,7 +160,7 @@ EXTN(jsimd_h2v1_downsample_sse2): jg near .rowloop .return: - uncollect_args + uncollect_args 6 pop rbp ret @@ -176,10 +176,10 @@ EXTN(jsimd_h2v1_downsample_sse2): ; JSAMPARRAY input_data, JSAMPARRAY output_data); ; -; r10 = JDIMENSION image_width +; r10d = JDIMENSION image_width ; r11 = int max_v_samp_factor -; r12 = JDIMENSION v_samp_factor -; r13 = JDIMENSION width_blocks +; r12d = JDIMENSION v_samp_factor +; r13d = JDIMENSION width_blocks ; r14 = JSAMPARRAY input_data ; r15 = JSAMPARRAY output_data @@ -190,7 +190,7 @@ EXTN(jsimd_h2v2_downsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 6 mov ecx, r13d shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols) @@ -320,7 +320,7 @@ EXTN(jsimd_h2v2_downsample_sse2): jg near .rowloop .return: - uncollect_args + uncollect_args 6 pop rbp ret diff --git a/simd/jdcolext-sse2-64.asm b/simd/jdcolext-sse2-64.asm index a42091e..b48b2b7 100644 --- a/simd/jdcolext-sse2-64.asm +++ b/simd/jdcolext-sse2-64.asm @@ -28,11 +28,11 @@ ; JSAMPARRAY output_buf, int num_rows) ; -; r10 = JDIMENSION out_width +; r10d = JDIMENSION out_width ; r11 = JSAMPIMAGE input_buf -; r12 = JDIMENSION input_row +; r12d = JDIMENSION input_row ; r13 = JSAMPARRAY output_buf -; r14 = int num_rows +; r14d = int num_rows %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] %define WK_NUM 2 @@ -48,7 +48,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 5 push rbx mov ecx, r10d ; num_cols @@ -429,7 +429,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2): .return: pop rbx - uncollect_args + uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jdmrgext-sse2-64.asm b/simd/jdmrgext-sse2-64.asm index 9e8eb27..93a3ef3 100644 --- a/simd/jdmrgext-sse2-64.asm +++ b/simd/jdmrgext-sse2-64.asm @@ -29,9 +29,9 @@ ; JSAMPARRAY output_buf); ; -; r10 = JDIMENSION output_width +; r10d = JDIMENSION output_width ; r11 = JSAMPIMAGE input_buf -; r12 = JDIMENSION in_row_group_ctr +; r12d = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] @@ -48,7 +48,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 4 push rbx mov ecx, r10d ; col @@ -422,7 +422,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): .return: pop rbx - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp @@ -439,9 +439,9 @@ EXTN(jsimd_h2v1_merged_upsample_sse2): ; JSAMPARRAY output_buf); ; -; r10 = JDIMENSION output_width +; r10d = JDIMENSION output_width ; r11 = JSAMPIMAGE input_buf -; r12 = JDIMENSION in_row_group_ctr +; r12d = JDIMENSION in_row_group_ctr ; r13 = JSAMPARRAY output_buf align 16 @@ -451,7 +451,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 4 push rbx mov eax, r10d @@ -528,7 +528,7 @@ EXTN(jsimd_h2v2_merged_upsample_sse2): pop rdx pop rbx - uncollect_args + uncollect_args 4 pop rbp ret diff --git a/simd/jdsample-sse2-64.asm b/simd/jdsample-sse2-64.asm index 9a99050..11c3464 100644 --- a/simd/jdsample-sse2-64.asm +++ b/simd/jdsample-sse2-64.asm @@ -53,7 +53,7 @@ PW_EIGHT times 8 dw 8 ; ; r10 = int max_v_samp_factor -; r11 = JDIMENSION downsampled_width +; r11d = JDIMENSION downsampled_width ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr @@ -64,7 +64,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 4 mov eax, r11d ; colctr test rax, rax @@ -175,7 +175,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): jg near .rowloop .return: - uncollect_args + uncollect_args 4 pop rbp ret @@ -192,7 +192,7 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2): ; ; r10 = int max_v_samp_factor -; r11 = JDIMENSION downsampled_width +; r11d = JDIMENSION downsampled_width ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr @@ -210,7 +210,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 4 push rbx mov eax, r11d ; colctr @@ -473,7 +473,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): .return: pop rbx - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp @@ -492,7 +492,7 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2): ; ; r10 = int max_v_samp_factor -; r11 = JDIMENSION output_width +; r11d = JDIMENSION output_width ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr @@ -503,7 +503,7 @@ EXTN(jsimd_h2v1_upsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 4 mov edx, r11d add rdx, byte (2*SIZEOF_XMMWORD)-1 @@ -564,7 +564,7 @@ EXTN(jsimd_h2v1_upsample_sse2): jg short .rowloop .return: - uncollect_args + uncollect_args 4 pop rbp ret @@ -581,7 +581,7 @@ EXTN(jsimd_h2v1_upsample_sse2): ; ; r10 = int max_v_samp_factor -; r11 = JDIMENSION output_width +; r11d = JDIMENSION output_width ; r12 = JSAMPARRAY input_data ; r13 = JSAMPARRAY *output_data_ptr @@ -592,7 +592,7 @@ EXTN(jsimd_h2v2_upsample_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 4 push rbx mov edx, r11d @@ -661,7 +661,7 @@ EXTN(jsimd_h2v2_upsample_sse2): .return: pop rbx - uncollect_args + uncollect_args 4 pop rbp ret diff --git a/simd/jfdctflt-sse-64.asm b/simd/jfdctflt-sse-64.asm index d52568d..cd48134 100644 --- a/simd/jfdctflt-sse-64.asm +++ b/simd/jfdctflt-sse-64.asm @@ -74,7 +74,7 @@ EXTN(jsimd_fdct_float_sse): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 1 ; ---- Pass 1: process rows. @@ -346,7 +346,7 @@ EXTN(jsimd_fdct_float_sse): dec rcx jnz near .columnloop - uncollect_args + uncollect_args 1 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jfdctfst-sse2-64.asm b/simd/jfdctfst-sse2-64.asm index 19aa304..5fbc4d7 100644 --- a/simd/jfdctfst-sse2-64.asm +++ b/simd/jfdctfst-sse2-64.asm @@ -89,7 +89,7 @@ EXTN(jsimd_fdct_ifast_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 1 ; ---- Pass 1: process rows. @@ -380,7 +380,7 @@ EXTN(jsimd_fdct_ifast_sse2): movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6 movdqa XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2 - uncollect_args + uncollect_args 1 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jfdctint-sse2-64.asm b/simd/jfdctint-sse2-64.asm index 0f82cdf..70148f1 100644 --- a/simd/jfdctint-sse2-64.asm +++ b/simd/jfdctint-sse2-64.asm @@ -110,7 +110,7 @@ EXTN(jsimd_fdct_islow_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 1 ; ---- Pass 1: process rows. @@ -610,7 +610,7 @@ EXTN(jsimd_fdct_islow_sse2): movdqa XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1 movdqa XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3 - uncollect_args + uncollect_args 1 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jidctflt-sse2-64.asm b/simd/jidctflt-sse2-64.asm index 9d78fa1..7e714d9 100644 --- a/simd/jidctflt-sse2-64.asm +++ b/simd/jidctflt-sse2-64.asm @@ -64,7 +64,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r10 = void *dct_table ; r11 = JCOEFPTR coef_block ; r12 = JSAMPARRAY output_buf -; r13 = JDIMENSION output_col +; r13d = JDIMENSION output_col %define original_rbp rbp+0 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] @@ -83,7 +83,7 @@ EXTN(jsimd_idct_float_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [workspace] - collect_args + collect_args 4 push rbx ; ---- Pass 1: process columns from input, store into work array. @@ -471,7 +471,7 @@ EXTN(jsimd_idct_float_sse2): jnz near .rowloop pop rbx - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jidctfst-sse2-64.asm b/simd/jidctfst-sse2-64.asm index 93dd6aa..e7a1617 100644 --- a/simd/jidctfst-sse2-64.asm +++ b/simd/jidctfst-sse2-64.asm @@ -85,7 +85,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r10 = jpeg_component_info *compptr ; r11 = JCOEFPTR coef_block ; r12 = JSAMPARRAY output_buf -; r13 = JDIMENSION output_col +; r13d = JDIMENSION output_col %define original_rbp rbp+0 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] @@ -102,7 +102,7 @@ EXTN(jsimd_idct_ifast_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 4 ; ---- Pass 1: process columns from input. @@ -479,7 +479,7 @@ EXTN(jsimd_idct_ifast_sse2): movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2 - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jidctint-sse2-64.asm b/simd/jidctint-sse2-64.asm index 6331181..a8cbce8 100644 --- a/simd/jidctint-sse2-64.asm +++ b/simd/jidctint-sse2-64.asm @@ -98,7 +98,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r10 = jpeg_component_info *compptr ; r11 = JCOEFPTR coef_block ; r12 = JSAMPARRAY output_buf -; r13 = JDIMENSION output_col +; r13d = JDIMENSION output_col %define original_rbp rbp+0 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] @@ -115,7 +115,7 @@ EXTN(jsimd_idct_islow_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 4 ; ---- Pass 1: process columns from input. @@ -836,7 +836,7 @@ EXTN(jsimd_idct_islow_sse2): movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2 movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5 - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp diff --git a/simd/jidctred-sse2-64.asm b/simd/jidctred-sse2-64.asm index 31a3f36..dace694 100644 --- a/simd/jidctred-sse2-64.asm +++ b/simd/jidctred-sse2-64.asm @@ -106,7 +106,7 @@ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; r10 = void *dct_table ; r11 = JCOEFPTR coef_block ; r12 = JSAMPARRAY output_buf -; r13 = JDIMENSION output_col +; r13d = JDIMENSION output_col %define original_rbp rbp+0 %define wk(i) rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] @@ -123,7 +123,7 @@ EXTN(jsimd_idct_4x4_sse2): mov [rsp], rax mov rbp, rsp ; rbp = aligned rbp lea rsp, [wk(0)] - collect_args + collect_args 4 ; ---- Pass 1: process columns from input. @@ -389,7 +389,7 @@ EXTN(jsimd_idct_4x4_sse2): movd XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1 movd XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3 - uncollect_args + uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp pop rbp @@ -409,7 +409,7 @@ EXTN(jsimd_idct_4x4_sse2): ; r10 = void *dct_table ; r11 = JCOEFPTR coef_block ; r12 = JSAMPARRAY output_buf -; r13 = JDIMENSION output_col +; r13d = JDIMENSION output_col align 16 global EXTN(jsimd_idct_2x2_sse2) @@ -418,7 +418,7 @@ EXTN(jsimd_idct_2x2_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 4 push rbx ; ---- Pass 1: process columns from input. @@ -566,7 +566,7 @@ EXTN(jsimd_idct_2x2_sse2): mov WORD [rsi+rax*SIZEOF_JSAMPLE], cx pop rbx - uncollect_args + uncollect_args 4 pop rbp ret diff --git a/simd/jquantf-sse2-64.asm b/simd/jquantf-sse2-64.asm index ed69bc2..241b857 100644 --- a/simd/jquantf-sse2-64.asm +++ b/simd/jquantf-sse2-64.asm @@ -31,7 +31,7 @@ ; ; r10 = JSAMPARRAY sample_data -; r11 = JDIMENSION start_col +; r11d = JDIMENSION start_col ; r12 = FAST_FLOAT *workspace align 16 @@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_float_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 3 push rbx pcmpeqw xmm7, xmm7 @@ -90,7 +90,7 @@ EXTN(jsimd_convsamp_float_sse2): jnz short .convloop pop rbx - uncollect_args + uncollect_args 3 pop rbp ret @@ -115,7 +115,7 @@ EXTN(jsimd_quantize_float_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 3 mov rsi, r12 mov rdx, r11 @@ -148,7 +148,7 @@ EXTN(jsimd_quantize_float_sse2): dec rax jnz short .quantloop - uncollect_args + uncollect_args 3 pop rbp ret diff --git a/simd/jquanti-sse2-64.asm b/simd/jquanti-sse2-64.asm index 7cb6872..0f8f9ec 100644 --- a/simd/jquanti-sse2-64.asm +++ b/simd/jquanti-sse2-64.asm @@ -31,7 +31,7 @@ ; ; r10 = JSAMPARRAY sample_data -; r11 = JDIMENSION start_col +; r11d = JDIMENSION start_col ; r12 = DCTELEM *workspace align 16 @@ -41,7 +41,7 @@ EXTN(jsimd_convsamp_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 3 push rbx pxor xmm6, xmm6 ; xmm6=(all 0's) @@ -85,7 +85,7 @@ EXTN(jsimd_convsamp_sse2): jnz short .convloop pop rbx - uncollect_args + uncollect_args 3 pop rbp ret @@ -117,7 +117,7 @@ EXTN(jsimd_quantize_sse2): push rbp mov rax, rsp mov rbp, rsp - collect_args + collect_args 3 mov rsi, r12 mov rdx, r11 @@ -177,7 +177,7 @@ EXTN(jsimd_quantize_sse2): dec rax jnz near .quantloop - uncollect_args + uncollect_args 3 pop rbp ret diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc index 7c0b809..c138f01 100644 --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@ -307,61 +307,99 @@ const_base: %ifdef WIN64 -%imacro collect_args 0 - push r12 - push r13 - push r14 - push r15 +%imacro collect_args 1 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm6 + sub rsp, SIZEOF_XMMWORD + movaps XMMWORD [rsp], xmm7 mov r10, rcx +%if %1 > 1 mov r11, rdx +%endif +%if %1 > 2 + push r12 mov r12, r8 +%endif +%if %1 > 3 + push r13 mov r13, r9 +%endif +%if %1 > 4 + push r14 mov r14, [rax+48] +%endif +%if %1 > 5 + push r15 mov r15, [rax+56] +%endif push rsi push rdi - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm6 - sub rsp, SIZEOF_XMMWORD - movaps XMMWORD [rsp], xmm7 %endmacro -%imacro uncollect_args 0 - movaps xmm7, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD - movaps xmm6, XMMWORD [rsp] - add rsp, SIZEOF_XMMWORD +%imacro uncollect_args 1 pop rdi pop rsi +%if %1 > 5 pop r15 +%endif +%if %1 > 4 pop r14 +%endif +%if %1 > 3 pop r13 +%endif +%if %1 > 2 pop r12 +%endif + movaps xmm7, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD + movaps xmm6, XMMWORD [rsp] + add rsp, SIZEOF_XMMWORD %endmacro %else -%imacro collect_args 0 +%imacro collect_args 1 push r10 - push r11 - push r12 - push r13 - push r14 - push r15 mov r10, rdi +%if %1 > 1 + push r11 mov r11, rsi +%endif +%if %1 > 2 + push r12 mov r12, rdx +%endif +%if %1 > 3 + push r13 mov r13, rcx +%endif +%if %1 > 4 + push r14 mov r14, r8 +%endif +%if %1 > 5 + push r15 mov r15, r9 +%endif %endmacro -%imacro uncollect_args 0 +%imacro uncollect_args 1 +%if %1 > 5 pop r15 +%endif +%if %1 > 4 pop r14 +%endif +%if %1 > 3 pop r13 +%endif +%if %1 > 2 pop r12 +%endif +%if %1 > 1 pop r11 +%endif pop r10 %endmacro -- 2.50.1