From a7c2f979390706e3b834c80943ad490f34bbef16 Mon Sep 17 00:00:00 2001 From: DRC Date: Fri, 8 Jul 2016 20:10:24 -0500 Subject: [PATCH] AVX2: Avoid expensive AVX-SSE transitions Refer to https://software.intel.com/sites/default/files/m/d/4/1/d/8/11MC12_Avoiding_2BAVX-SSE_2BTransition_2BPenalties_2Brh_2Bfinal.pdf for more information. This eliminates all AVX-SSE transitions detected with the Intel SDE tool. --- simd/jccolext-avx2-64.asm | 1 + simd/jccolext-avx2.asm | 1 + simd/jcgryext-avx2-64.asm | 1 + simd/jcgryext-avx2.asm | 1 + simd/jcsample-avx2-64.asm | 2 ++ simd/jcsample-avx2.asm | 2 ++ simd/jdcolext-avx2-64.asm | 1 + simd/jdcolext-avx2.asm | 1 + simd/jdmrgext-avx2-64.asm | 1 + simd/jdmrgext-avx2.asm | 1 + simd/jdsample-avx2-64.asm | 4 ++++ simd/jdsample-avx2.asm | 4 ++++ 12 files changed, 20 insertions(+) diff --git a/simd/jccolext-avx2-64.asm b/simd/jccolext-avx2-64.asm index 2a8e9ba..a7e977a 100644 --- a/simd/jccolext-avx2-64.asm +++ b/simd/jccolext-avx2-64.asm @@ -550,6 +550,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2): .return: pop rbx + vzeroupper uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp diff --git a/simd/jccolext-avx2.asm b/simd/jccolext-avx2.asm index d4c40e0..19e05c0 100644 --- a/simd/jccolext-avx2.asm +++ b/simd/jccolext-avx2.asm @@ -565,6 +565,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved diff --git a/simd/jcgryext-avx2-64.asm b/simd/jcgryext-avx2-64.asm index ea42598..8824a45 100644 --- a/simd/jcgryext-avx2-64.asm +++ b/simd/jcgryext-avx2-64.asm @@ -428,6 +428,7 @@ EXTN(jsimd_rgb_gray_convert_avx2): .return: pop rbx + vzeroupper uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp diff --git a/simd/jcgryext-avx2.asm b/simd/jcgryext-avx2.asm index 81afbec..4f81c20 100644 --- a/simd/jcgryext-avx2.asm +++ b/simd/jcgryext-avx2.asm @@ -443,6 +443,7 @@ EXTN(jsimd_rgb_gray_convert_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved diff --git a/simd/jcsample-avx2-64.asm b/simd/jcsample-avx2-64.asm index e45cc6d..d7877ea 100644 --- a/simd/jcsample-avx2-64.asm +++ b/simd/jcsample-avx2-64.asm @@ -177,6 +177,7 @@ EXTN(jsimd_h2v1_downsample_avx2): jg near .rowloop .return: + vzeroupper uncollect_args 6 pop rbp ret @@ -355,6 +356,7 @@ EXTN(jsimd_h2v2_downsample_avx2): jg near .rowloop .return: + vzeroupper uncollect_args 6 pop rbp ret diff --git a/simd/jcsample-avx2.asm b/simd/jcsample-avx2.asm index e94823e..9efd415 100644 --- a/simd/jcsample-avx2.asm +++ b/simd/jcsample-avx2.asm @@ -184,6 +184,7 @@ EXTN(jsimd_h2v1_downsample_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved @@ -373,6 +374,7 @@ EXTN(jsimd_h2v2_downsample_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved diff --git a/simd/jdcolext-avx2-64.asm b/simd/jdcolext-avx2-64.asm index c76c6b7..ca81ad8 100644 --- a/simd/jdcolext-avx2-64.asm +++ b/simd/jdcolext-avx2-64.asm @@ -485,6 +485,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): .return: pop rbx + vzeroupper uncollect_args 5 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp diff --git a/simd/jdcolext-avx2.asm b/simd/jdcolext-avx2.asm index ffe2428..1222b77 100644 --- a/simd/jdcolext-avx2.asm +++ b/simd/jdcolext-avx2.asm @@ -500,6 +500,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2): sfence ; flush the write buffer .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved diff --git a/simd/jdmrgext-avx2-64.asm b/simd/jdmrgext-avx2-64.asm index d8f3896..546a9e5 100644 --- a/simd/jdmrgext-avx2-64.asm +++ b/simd/jdmrgext-avx2-64.asm @@ -479,6 +479,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): .return: pop rbx + vzeroupper uncollect_args 4 mov rsp, rbp ; rsp <- aligned rbp pop rsp ; rsp <- original rbp diff --git a/simd/jdmrgext-avx2.asm b/simd/jdmrgext-avx2.asm index f365fa1..5f2b567 100644 --- a/simd/jdmrgext-avx2.asm +++ b/simd/jdmrgext-avx2.asm @@ -493,6 +493,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2): sfence ; flush the write buffer .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved diff --git a/simd/jdsample-avx2-64.asm b/simd/jdsample-avx2-64.asm index 654722b..f3384ca 100644 --- a/simd/jdsample-avx2-64.asm +++ b/simd/jdsample-avx2-64.asm @@ -186,6 +186,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): jg near .rowloop .return: + vzeroupper uncollect_args 4 pop_xmm 3 pop rbp @@ -497,6 +498,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): .return: pop rbx + vzeroupper uncollect_args 4 pop_xmm 3 mov rsp, rbp ; rsp <- aligned rbp @@ -590,6 +592,7 @@ EXTN(jsimd_h2v1_upsample_avx2): jg short .rowloop .return: + vzeroupper uncollect_args 4 pop rbp ret @@ -688,6 +691,7 @@ EXTN(jsimd_h2v2_upsample_avx2): .return: pop rbx + vzeroupper uncollect_args 4 pop rbp ret diff --git a/simd/jdsample-avx2.asm b/simd/jdsample-avx2.asm index 26fa1a9..d087e3d 100644 --- a/simd/jdsample-avx2.asm +++ b/simd/jdsample-avx2.asm @@ -193,6 +193,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved @@ -540,6 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved @@ -642,6 +644,7 @@ EXTN(jsimd_h2v1_upsample_avx2): jg short .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved @@ -748,6 +751,7 @@ EXTN(jsimd_h2v2_upsample_avx2): jg near .rowloop .return: + vzeroupper pop edi pop esi ; pop edx ; need not be preserved -- 2.40.0