From c4558efbf3a44a1b5e68dce46347dd3888db4760 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 14 Feb 2013 15:39:42 +0100 Subject: [PATCH] sha512-x86_64.pl: add AVX2 code path. --- crypto/sha/asm/sha512-x86_64.pl | 650 ++++++++++++++++++++++++++++---- 1 file changed, 568 insertions(+), 82 deletions(-) diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl index 8f10f973e9..da094f0b3c 100755 --- a/crypto/sha/asm/sha512-x86_64.pl +++ b/crypto/sha/asm/sha512-x86_64.pl @@ -59,6 +59,15 @@ # higher coefficients are observed on VIA Nano and Bulldozer has more # to do with specifics of their architecture [which is topic for # separate discussion]. +# +# November 2012. +# +# Add AVX2 code path. Two consecutive input blocks are loaded to +# 256-bit %ymm registers, with data from first block to least +# significant 128-bit halves and data from second to most significant. +# The data is then processed with same SIMD instruction sequence as +# for AVX, but with %ymm as operands. Side effect is increased stack +# frame, 448 additional bytes in SHA256 and 1152 in SHA512. ###################################################################### # Current performance in cycles per processed byte (less is better): @@ -69,13 +78,13 @@ # P4 17.5 - - 33.4 - # Core 2 15.5 13.9(+11%) - 10.3 - # Westmere 15.1 12.5(+21%) - 9.72 - -# Atom 23.0 21.6(+6%) - 14.7 - -# VIA Nano 23.0 16.3(+41%) - 14.7 - # Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%) # Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%) +# VIA Nano 23.0 16.3(+41%) - 14.7 - +# Atom 23.0 21.6(+6%) - 14.7 - # -# (*) whichever applicable; +# (*) whichever best applicable; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions @@ -93,15 +102,20 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/ && - $1>=2.19); -$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && - $1>=2.09); -$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./ && - $1>=10); +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; @@ -145,6 +159,8 @@ $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 @@ -186,7 +202,7 @@ $code.=<<___ if ($i>=15); mov `$SZ*(($i+2)&0xf)`(%rsp),$a0 ___ $code.=<<___; - lea $SZ($Tbl),$Tbl # round++ + lea $STRIDE($Tbl),$Tbl # round++ add $a1,$h # h+=Sigma0(a) ___ @@ -229,28 +245,34 @@ $code=<<___; .extern OPENSSL_ia32cap_P .globl $func -.type $func,\@function,4 +.type $func,\@function,3 .align 16 $func: ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 - mov 0(%r11),%r10d - mov 4(%r11),%r11d + mov 0(%r11),%r9d + mov 4(%r11),%r10d + mov 8(%r11),%r11d ___ $code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r11d # check for XOP + test \$`1<<11`,%r10d # check for XOP jnz .Lxop_shortcut ___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je .Lavx2_shortcut +___ $code.=<<___ if ($avx); - and \$`1<<30`,%r10d # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r11d # mask AVX and SSSE3 bits - or %r10d,%r11d - cmp \$`1<<28|1<<9|1<<30`,%r11d + and \$`1<<30`,%r9d # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits + or %r9d,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d je .Lavx_shortcut ___ $code.=<<___ if ($SZ==4); - test \$`1<<9`,%r11d + test \$`1<<9`,%r10d jnz .Lssse3_shortcut ___ $code.=<<___; @@ -352,24 +374,43 @@ $code.=<<___; .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " ___ @@ -379,48 +420,89 @@ $code.=<<___; .type $TABLE,\@object $TABLE: .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f - .asciz "SHA512 block transfort for x86_64, CRYPTOGAMS by " + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " ___ } @@ -489,7 +571,7 @@ my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; -.type ${func}_ssse3,\@function,4 +.type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: .Lssse3_shortcut: @@ -529,12 +611,12 @@ $code.=<<___; ___ $code.=<<___; - movdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 + movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: - movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] @@ -544,11 +626,11 @@ $code.=<<___; pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 pshufb $t3,@X[2] - movdqa 0x10($Tbl),$t1 + movdqa 0x20($Tbl),$t1 paddd @X[0],$t0 - movdqa 0x20($Tbl),$t2 + movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] - movdqa 0x30($Tbl),$t3 + movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 @@ -564,7 +646,7 @@ $code.=<<___; .align 16 .Lssse3_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( @@ -601,7 +683,7 @@ sub Xupdate_256_SSSE3 () { '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', - '&movdqa ($t2,16*$j."($Tbl)")', + '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); @@ -744,7 +826,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); - &movdqa ($t2,16*$j."($Tbl)"); + &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); &pshufb ($t3,$t5); @@ -767,7 +849,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { @@ -827,7 +909,7 @@ if ($avx) {{ # if ($SZ==8) { # SHA512 only $code.=<<___; -.type ${func}_xop,\@function,4 +.type ${func}_xop,\@function,3 .align 64 ${func}_xop: .Lxop_shortcut: @@ -878,7 +960,7 @@ ___ $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -889,9 +971,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -904,7 +986,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub XOP_256_00_47 () { my $j = shift; @@ -1001,7 +1083,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1010,7 +1092,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1024,9 +1106,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1040,20 +1122,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1066,7 +1148,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + add \$16*2*$SZ,$Tbl ___ sub XOP_512_00_47 () { my $j = shift; @@ -1129,7 +1211,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1138,7 +1220,7 @@ my @insns = (&$body,&$body); # 52 instructions &XOP_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1203,7 +1285,7 @@ ___ local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; -.type ${func}_avx,\@function,4 +.type ${func}_avx,\@function,3 .align 64 ${func}_avx: .Lavx_shortcut: @@ -1251,12 +1333,12 @@ ___ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; - vmovdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - vmovdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -1267,9 +1349,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -1282,7 +1364,7 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( @@ -1330,7 +1412,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1339,7 +1421,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1354,9 +1436,9 @@ $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1370,20 +1452,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1396,14 +1478,14 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + add \$16*2*$SZ,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] - '&vpsrlq ($t2,$t0,$sigma0[0]);', - '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', @@ -1413,7 +1495,7 @@ sub Xupdate_512_AVX () { '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) - '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1])', + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', @@ -1437,7 +1519,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1446,7 +1528,7 @@ my @insns = (&$body,&$body); # 52 instructions &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1504,6 +1586,389 @@ $code.=<<___; ret .size ${func}_avx,.-${func}_avx ___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,3 +.align 64 +${func}_avx2: +.Lavx2_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp + shl \$4,%rdx # num*16 + and \$-256*$SZ,%rsp # align stack frame + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + add \$`2*$SZ*($rounds-8)`,%rsp + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + + vzeroall + sub \$-16*$SZ,$inp # inp++, size optimization + mov $SZ*0($ctx),$A + xor %r12,%r12 # borrow $T1 + mov $SZ*1($ctx),$B + cmp %rdx,$inp # $_end + mov $SZ*2($ctx),$C + sete %r12b + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + shl \$`log(16*$SZ)/log(2)`,%r12 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + neg %r12 + vmovdqu -16*$SZ+0($inp),$t0 + add $inp,%r12 + vmovdqu -16*$SZ+32($inp),$t1 + vmovdqu (%r12),@X[2] # next or same input block + vmovdqu 32(%r12),@X[3] + + vperm2i128 \$0x20,@X[2],$t0,@X[0] + #mov $inp,$_inp # offload $inp + vperm2i128 \$0x31,@X[2],$t0,@X[1] + vperm2i128 \$0x20,@X[3],$t1,@X[2] + vperm2i128 \$0x31,@X[3],$t1,@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[0],@X[0] + vpshufb $t3,@X[1],@X[1] + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + lea -$PUSH8(%rsp),%rsp + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } else { # SHA512 + my @X = map("%ymm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + shl \$`log(16*$SZ)/log(2)`,%r12 + vmovdqu -16*$SZ($inp),$t0 + neg %r12 + vmovdqu -16*$SZ+32($inp),$t1 + add $inp,%r12 + vmovdqu -16*$SZ+64($inp),$t2 + vmovdqu -16*$SZ+96($inp),$t3 + vmovdqu (%r12),@X[4] # next or same block + vmovdqu 32(%r12),@X[5] + vmovdqu 64(%r12),@X[6] + vmovdqu 96(%r12),@X[7] + + vperm2i128 \$0x20,@X[4],$t0,@X[0] + #mov $inp,$_inp # offload $inp + vperm2i128 \$0x31,@X[4],$t0,@X[1] + vperm2i128 \$0x20,@X[5],$t1,@X[2] + vperm2i128 \$0x31,@X[5],$t1,@X[3] + vperm2i128 \$0x20,@X[6],$t2,@X[4] + vperm2i128 \$0x31,@X[6],$t2,@X[5] + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t2 + vperm2i128 \$0x20,@X[7],$t3,@X[6] + vperm2i128 \$0x31,@X[7],$t3,@X[7] + + vpshufb $t2,@X[0],@X[0] + lea $TABLE+0x80(%rip),$Tbl # size optimization + vpshufb $t2,@X[1],@X[1] + vpshufb $t2,@X[2],@X[2] + vpshufb $t2,@X[3],@X[3] + vpshufb $t2,@X[4],@X[4] + vpshufb $t2,@X[5],@X[5] + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t2,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t2,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x20(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x40(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x60(%rsp) + lea -$PUSH8(%rsp),%rsp + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + mov $B,$a3 + vmovdqa $t2,0x40(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x60(%rsp) + mov $F,$a4 + add \$16*2*$SZ,$Tbl + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 48 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + if ($_ !~ /\;$/) { + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX2_512_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1-0x80)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } +} +$code.=<<___; + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + cmp `$PUSH8+2*8`($Tbl),$inp # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + xor $C,$a3 # magic + mov $F,$a4 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: +___ + for ($i=0; $i<8; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + } +$code.=<<___; + lea -$PUSH8($Tbl),$Tbl + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),%rsp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + lea `2*16*$SZ`($inp),$inp # inp+=2 + add $SZ*6($ctx),$G + xor %r12,%r12 + add $SZ*7($ctx),$H + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + sete %r12b + jbe .Loop_avx2 + lea (%rsp),$Tbl + +.Ldone_avx2: + lea ($Tbl),%rsp + mov $_rsp,%rsi + vzeroall +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx2: + ret +.size ${func}_avx2,.-${func}_avx2 +___ +}} }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, @@ -1547,7 +2012,17 @@ se_handler: lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - +___ +$code.=<<___ if ($avx>1); + lea .Lavx2_shortcut(%rip),%r10 + cmp %r10,%rbx # context->Rip1); + .rva .LSEH_begin_${func}_avx2 + .rva .LSEH_end_${func}_avx2 + .rva .LSEH_info_${func}_avx2 +___ $code.=<<___; .section .xdata .align 8 @@ -1661,6 +2141,12 @@ $code.=<<___ if ($avx); .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ +$code.=<<___ if ($avx>1); +.LSEH_info_${func}_avx2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; -- 2.40.0