granicus.if.org Git - libjpeg-turbo/commitdiff
AVX2: Avoid expensive AVX-SSE transitions
author: DRC <information@libjpeg-turbo.org>
Sat, 9 Jul 2016 01:10:24 +0000 (20:10 -0500)
committer: DRC <information@libjpeg-turbo.org>
Sat, 9 Jul 2016 01:10:24 +0000 (20:10 -0500)
Refer to
https://software.intel.com/sites/default/files/m/d/4/1/d/8/11MC12_Avoiding_2BAVX-SSE_2BTransition_2BPenalties_2Brh_2Bfinal.pdf
for more information.  This commit adds a vzeroupper instruction to the
return path of each AVX2 routine it touches, which eliminates all AVX-SSE
transitions detected with the Intel SDE tool.

12 files changed:
simd/jccolext-avx2-64.asm
simd/jccolext-avx2.asm
simd/jcgryext-avx2-64.asm
simd/jcgryext-avx2.asm
simd/jcsample-avx2-64.asm
simd/jcsample-avx2.asm
simd/jdcolext-avx2-64.asm
simd/jdcolext-avx2.asm
simd/jdmrgext-avx2-64.asm
simd/jdmrgext-avx2.asm
simd/jdsample-avx2-64.asm
simd/jdsample-avx2.asm

index 2a8e9ba1d943733df9c0abfd23d0fbfe78a8bffe..a7e977a4c590259ddb5d8fe3e0a27dd5bae4eb43 100644 (file)
@@ -550,6 +550,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 5
     mov         rsp, rbp                ; rsp <- aligned rbp
     pop         rsp                     ; rsp <- original rbp
index d4c40e0a4bd6d39c041e628545c9a72a53a39918..19e05c0d67288812a1c91e6c0bb6265c1fc0e3bd 100644 (file)
@@ -565,6 +565,7 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
index ea425987662fbb535f40215a4087c64805b3a898..8824a453f71918e386e97afcfcb49aec7839c51d 100644 (file)
@@ -428,6 +428,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 5
     mov         rsp, rbp                ; rsp <- aligned rbp
     pop         rsp                     ; rsp <- original rbp
index 81afbec7bf1ca3ecf97cd5ae15d8ebb5cbdf7532..4f81c20072a4205a5f1ba4eeba88c2ffaae06063 100644 (file)
@@ -443,6 +443,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
index e45cc6d08c146ad2a08fd55151054c6ba492217e..d7877ea6677846773609c98943e42c33c37e9fe2 100644 (file)
@@ -177,6 +177,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     uncollect_args 6
     pop         rbp
     ret
@@ -355,6 +356,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     uncollect_args 6
     pop         rbp
     ret
index e94823e9ebe22ea31f22b6c1249650e3388f2a0d..9efd415c0c3564d448c93f374628fb1580dd797e 100644 (file)
@@ -184,6 +184,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
@@ -373,6 +374,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
index c76c6b7e308e8d4fc1feba46931f966375785bf7..ca81ad84cc625a5ac7d900e28621278fcf22cd9a 100644 (file)
@@ -485,6 +485,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 5
     mov         rsp, rbp                ; rsp <- aligned rbp
     pop         rsp                     ; rsp <- original rbp
index ffe2428a3575fd74613599b36ff8976f447aeec3..1222b7706a96df8f122d1e906bd1c6983dbc249c 100644 (file)
@@ -500,6 +500,7 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     sfence                              ; flush the write buffer
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
index d8f389624eec1d8702b90d32219bf966952beebe..546a9e5ea0c1f5ffc93402411707ac3c1d1beb62 100644 (file)
@@ -479,6 +479,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 4
     mov         rsp, rbp                ; rsp <- aligned rbp
     pop         rsp                     ; rsp <- original rbp
index f365fa118fb5584cabcaa01b58f4fcada9480235..5f2b56752374b006d17995909055d6c4c0203ae0 100644 (file)
@@ -493,6 +493,7 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
     sfence                              ; flush the write buffer
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
index 654722bdf9cadaf27b90772c01608258e66cc055..f3384ca82566c6bd18e82983c8b7fd16f660a5d8 100644 (file)
@@ -186,6 +186,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     uncollect_args 4
     pop_xmm     3
     pop         rbp
@@ -497,6 +498,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 4
     pop_xmm     3
     mov         rsp, rbp                ; rsp <- aligned rbp
@@ -590,6 +592,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
     jg          short .rowloop
 
 .return:
+    vzeroupper
     uncollect_args 4
     pop         rbp
     ret
@@ -688,6 +691,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
 
 .return:
     pop         rbx
+    vzeroupper
     uncollect_args 4
     pop         rbp
     ret
index 26fa1a9a5c6839beab9df2258ed3e6d54dcc2a24..d087e3d7ff6d84a33c9c373224457fa41741b2df 100644 (file)
@@ -193,6 +193,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
@@ -540,6 +541,7 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
@@ -642,6 +644,7 @@ EXTN(jsimd_h2v1_upsample_avx2):
     jg          short .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved
@@ -748,6 +751,7 @@ EXTN(jsimd_h2v2_upsample_avx2):
     jg          near .rowloop
 
 .return:
+    vzeroupper
     pop         edi
     pop         esi
 ;   pop         edx                     ; need not be preserved