push %r15
shl \$3,${num}d # convert $num to bytes
+ .byte 0x67 # address-size prefix glued onto the xor below; that xor has no memory operand, so presumably padding only -- TODO confirm intent
xor %r10,%r10 # %r10=0, so the sub below leaves -$num in it
mov %rsp,%r11 # put aside %rsp
sub $num,%r10 # -$num
mov ($bp),%rdx # b[0], $bp==%rdx actually
lea 64+32(%rsp),$tptr
mov %rdx,$bi
- xor $zero,$zero # of=0,cf=0
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
- adcx %rax,%r11
+ add %rax,%r11
mov $bptr,8(%rsp) # off-load &b[i]
mulx 2*8($aptr),%r12,%r13 # ...
- adcx %r14,%r12
- adcx $zero,%r13
+ adc %r14,%r12
+ adc \$0,%r13
mov $mi,$bptr # borrow $bptr
imulq 24(%rsp),$mi # "t[0]"*n0
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
- mulx 2*8($nptr),%rax,%r12
+ .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
mov 48(%rsp),$bptr # counter value
mov %r10,-4*8($tptr)
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
- .byte 0x66,0x66
mov $bi,%rdx
mov %r11,-3*8($tptr)
adcx %rax,%r12
lea 4*8($nptr),$nptr
mov %r12,-2*8($tptr)
- #jmp .Lmulx4x_1st
+ jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
mov %r10,-4*8($tptr)
- mov 0*8($tptr),%r10
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
adcx %rax,%r12
adox $zero,%r15 # of=0
mov 48(%rsp),$bptr # counter value
- .byte 0x66,0x3e
mov %r12,-2*8($tptr)
+ .byte 0x66
lea 4*8($nptr),$nptr
- jmp .Lmulx4x_inner
+ #jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
adcx $zero,%r15 # cf=0, modulo-scheduled
- adox %r10,%r14
+ adox 0*8($tptr),%r14
mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
- mov 1*8($tptr),%r13
adcx %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
adox %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
- adcx %r13,%r11
+ adcx 1*8($tptr),%r11
adox %r14,%r12
mulx 3*8($aptr),%r13,%r14
mov $mi,%rdx
adox %rax,%r13
adcx 3*8($tptr),%r13
adox $zero,%r14 # of=0
- .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
- .byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
adcx $zero,%r14 # cf=0
adox %r15,%r10
adox %r15,%r12
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
- mov 0*8($tptr),%r10
adcx %rax,%r12
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
adc $zero,%r15 # modulo-scheduled
- sub %r10,$zero # pull top-most carry
+ sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
jne .Lmulx4x_outer
neg $num
+ xor %rdx,%rdx
mov 32(%rsp),$rptr # restore rp
lea 64(%rsp),$tptr
- xor %rdx,%rdx
pxor %xmm0,%xmm0
mov 0*8($nptr,$num),%r8
mov 1*8($nptr,$num),%r9
push %r15
shl \$3,${num}d # convert $num to bytes
+ .byte 0x67 # address-size prefix glued onto the xor below; that xor has no memory operand, so presumably padding only -- TODO confirm intent
xor %r10,%r10 # %r10=0, so the sub below leaves -$num in it
mov %rsp,%r11 # put aside %rsp
sub $num,%r10 # -$num
movq %r10, %xmm3 # -$num
movq %r11, %xmm4 # save original %rsp
mov $n0, 32(%rsp) # stash $n0 in the frame at 32(%rsp)
+___
+$code.=<<___ if ($win64);
+ jmp .Lsqrx8x_body
+.align 32
+___
+$code.=<<___;
.Lsqrx8x_body:
##################################################################
# Squaring part:
mov $aaptr,8(%rsp) # save end of $aptr
jmp .Lsqr8x_zero_start
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 # 12 bytes: prefixed long nop (0x0f,0x1f); presumably sized so that .Lsqr8x_zero_start lands on a 32-byte boundary -- TODO confirm against the assembled output
.Lsqrx8x_zero:
+ .byte 0x3e # segment-override prefix on the movdqa below; presumably padding only -- TODO confirm
movdqa %xmm0,0*8($tptr) # each movdqa stores the 16 bytes of %xmm0 (presumably all-zero here; its setup is not visible in this fragment)
movdqa %xmm0,2*8($tptr)
movdqa %xmm0,4*8($tptr)
movdqa %xmm0,6*8($tptr)
-.Lsqr8x_zero_start:
+.Lsqr8x_zero_start: # aligned at 32
movdqa %xmm0,8*8($tptr)
movdqa %xmm0,10*8($tptr)
movdqa %xmm0,12*8($tptr)
jnz .Lsqrx8x_zero # NOTE(review): the instruction that sets the flags for this branch is not visible in this fragment
mov 0*8($aptr),%rdx # a[0], modulo-scheduled
- xor %r8,%r8
- xor %r9,%r9
+ #xor %r9,%r9 # t[1], ex-$num, zero already
xor %r10,%r10 # clear %r10..%r14 before entering the outer loop
xor %r11,%r11
xor %r12,%r12
xor %r13,%r13
xor %r14,%r14
+ xor %r15,%r15 # %r15 is cleared here too
lea 48(%rsp),$tptr
xor $zero,$zero # cf=0, of=0
jmp .Lsqrx8x_outer_loop
.align 32
.Lsqrx8x_outer_loop:
- mulx 1*8($aptr),%rax,%rbx # a[1]*a[0]
- adcx %rax,%r8 # a[1]*a[0]+=t[1]
- adox %rbx,%r9
- mulx 2*8($aptr),%rax,%rbx # a[2]*a[0]
- adcx %rax,%r9
- adox %rbx,%r10
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%rax,%rbx # ...
- adcx %rax,%r10
- adox %rbx,%r11
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%rax,%rbx
- adcx %rax,%r11
- adox %rbx,%r12
- mulx 5*8($aptr),%rax,%rbx
- adcx %rax,%r12
- adox %rbx,%r13
- mulx 6*8($aptr),%rax,%rbx
- adcx %rax,%r13
- adox %rbx,%r14
- mulx 7*8($aptr),%rax,%r15
+ mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
+ adcx %r9,%r8 # a[1]*a[0]+=t[1]
+ adox %rax,%r10
+ mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
+ adcx %r10,%r9
+ adox %rax,%r11
+ .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
+ adcx %r11,%r10
+ adox %rax,%r12
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
+ adcx %r12,%r11
+ adox %rax,%r13
+ mulx 5*8($aptr),%r12,%rax
+ adcx %r13,%r12
+ adox %rax,%r14
+ mulx 6*8($aptr),%r13,%rax
+ adcx %r14,%r13
+ adox %r15,%rax
+ mulx 7*8($aptr),%r14,%r15
mov 1*8($aptr),%rdx # a[1]
adcx %rax,%r14
adox $zero,%r15
adc 8*8($tptr),%r15
+ mov %r8,1*8($tptr) # t[1]
+ mov %r9,2*8($tptr) # t[2]
sbb $carry,$carry # mov %cf,$carry
xor $zero,$zero # cf=0, of=0
- mov %r8,1*8($tptr) # t[1]
- mov %r9,2*8($tptr) # t[2]
mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
adcx %rbx,%r11
adox %rax,%r12
adcx %r14,%r12
- adox $zero,%r13 # of=0
- adcx $zero,%r13 # cf=0
-
mov %r8,5*8($tptr) # t[5]
mov %r9,6*8($tptr) # t[6]
+ mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
+ adox $zero,%r13 # of=0
+ adcx $zero,%r13 # cf=0
- mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
adcx %r10,%r8
adox %rax,%r9
adcx %r14,%r11
adox %rbx,%r12
adcx %rax,%r12
- .byte 0x66,0x66
adox $zero,%r13
+ .byte 0x67,0x67
mulx %r8,%r8,%r14 # a[7]*a[6]
adcx %r8,%r13
adcx $zero,%r14
je .Lsqrx8x_outer_break
neg $carry # mov $carry,%cf
+ mov \$-8,%rcx # loop counter, now set up before the carry chain (mov leaves flags alone)
mov $zero,%r15
mov 8*8($tptr),%r8
- adc 9*8($tptr),%r9 # +=t[9]
- adc 10*8($tptr),%r10 # ...
- adc 11*8($tptr),%r11
+ adcx 9*8($tptr),%r9 # +=t[9]
+ adcx 10*8($tptr),%r10 # ...
+ adcx 11*8($tptr),%r11
adc 12*8($tptr),%r12
adc 13*8($tptr),%r13
adc 14*8($tptr),%r14
adc 15*8($tptr),%r15
- lea 8*8($tptr),$tptr
- sbb $carry,$carry # mov %cf,$carry
+ lea ($aptr),$aaptr
+ lea 2*8*8($tptr),$tptr # both former 8*8 advances folded into one (lea leaves flags alone)
+ sbb %rax,%rax # mov %cf,%rax
mov -64($aptr),%rdx # a[0]
- lea ($aptr),$aaptr
- mov $carry,16(%rsp) # offload $carry
+ mov %rax,16(%rsp) # offload carry (held in %rax)
mov $tptr,24(%rsp) # with the added lines this saves $tptr after the full 2*8*8 advance
- lea 8*8($tptr),$tptr
+ #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
xor %eax,%eax # cf=0, of=0
- mov \$-8,%rcx
jmp .Lsqrx8x_loop
.align 32
adox %rbx,%r15 # %rbx is 0, of=0
adcx %rbx,%r15 # cf=0
+ .byte 0x67
inc %rcx # of=0
jnz .Lsqrx8x_loop
lea 8*8($aaptr),$aaptr
+ mov \$-8,%rcx
cmp 8(%rsp),$aaptr # done?
je .Lsqrx8x_break
sub 16(%rsp),%rbx # mov 16(%rsp),%cf
+ .byte 0x66
mov -64($aptr),%rdx
- adc 0*8($tptr),%r8
- adc 1*8($tptr),%r9
+ adcx 0*8($tptr),%r8
+ adcx 1*8($tptr),%r9
adc 2*8($tptr),%r10
adc 3*8($tptr),%r11
adc 4*8($tptr),%r12
adc 6*8($tptr),%r14
adc 7*8($tptr),%r15
lea 8*8($tptr),$tptr
- sbb %rbx,%rbx # mov %cf,%rbx
- xor %eax,%eax # cf=0, of=0
- mov %rbx,16(%rsp) # offload carry
- mov \$-8,%rcx
+ .byte 0x67
+ sbb %rax,%rax # mov %cf,%rax
+ xor %ebx,%ebx # cf=0, of=0
+ mov %rax,16(%rsp) # offload carry
jmp .Lsqrx8x_loop
.align 32
.Lsqrx8x_break:
sub 16(%rsp),%r8 # consume last carry
- mov 24(%rsp),$aaptr # initial $tptr
+ mov 24(%rsp),$carry # initial $tptr, borrow $carry
mov 0*8($aptr),%rdx # a[8], modulo-scheduled
+ xor %ebp,%ebp # xor $zero,$zero
mov %r8,0*8($tptr)
- lea 8*8($aaptr),$aaptr
+ cmp $carry,$tptr # cf=0, of=0
+ je .Lsqrx8x_outer_loop # $tptr already equals the saved value: skip the store/reload sequence below
+
mov %r9,1*8($tptr)
- mov 1*8($aaptr),%r8 # potentially forwarded store
+ mov 1*8($carry),%r9 # reload from the window addressed by the saved $tptr
mov %r10,2*8($tptr)
- mov 2*8($aaptr),%r9 # ...
+ mov 2*8($carry),%r10
mov %r11,3*8($tptr)
- mov 3*8($aaptr),%r10
+ mov 3*8($carry),%r11
mov %r12,4*8($tptr)
- mov 4*8($aaptr),%r11
+ mov 4*8($carry),%r12
mov %r13,5*8($tptr)
- mov 5*8($aaptr),%r12
+ mov 5*8($carry),%r13
mov %r14,6*8($tptr)
- mov 6*8($aaptr),%r13
+ mov 6*8($carry),%r14
mov %r15,7*8($tptr)
- mov 7*8($aaptr),%r14
- mov $aaptr,$tptr
- xor $zero,$zero # cf=0, cf=0
+ mov 7*8($carry),%r15
+ mov $carry,$tptr # continue the outer loop from the saved $tptr
jmp .Lsqrx8x_outer_loop
.align 32
}\f{
my $i="%rcx";
$code.=<<___;
- mov (%rsp),$num # restore $num
-
lea 48(%rsp),$tptr
mov ($aptr,$i),%rdx # a[0]
mov 8($tptr),$A0[1] # t[1]
xor $A0[0],$A0[0] # t[0], of=0, cf=0
+ mov (%rsp),$num # restore $num
adox $A0[1],$A0[1]
mov 16($tptr),$A1[0] # t[2] # prefetch
mov 24($tptr),$A1[1] # t[3] # prefetch
.align 32
.Lsqrx4x_shift_n_add_break:
adcx $A1[1],%rbx
- .byte 0x48,0x89,0x87,0x30,0x00,0x00,0x00 # mov %rax,48($tptr)
- .byte 0x48,0x89,0x9f,0x38,0x00,0x00,0x00 # mov %rbx,56($tptr)
- .byte 0x48,0x8d,0xbf,0x40,0x00,0x00,0x00 # lea 64($tptr),$tptr
+ mov %rax,48($tptr)
+ mov %rbx,56($tptr)
+ lea 64($tptr),$tptr # end of t[] buffer
___
}\f
######################################################################
$code.=<<___;
movq %xmm2,$nptr
+ xor %eax,%eax # initial top-most carry bit
mov 32(%rsp),%rbx # n0
mov 48(%rsp),%rdx # "%r8", 8*0($tptr)
- lea ($nptr,$num),%rax # end of n[]
+ lea -64($nptr,$num),%rcx # end of n[]
#lea 48(%rsp,$num,2),$tptr # end of t[] buffer
- mov %rax, 0(%rsp) # save end of n[]
+ mov %rcx, 0(%rsp) # save end of n[]
mov $tptr,8(%rsp) # save end of t[]
lea 48(%rsp),$tptr # initial t[] window
- xor %rax,%rax
- nop
- #jmp .Lsqrx8x_reduction_loop
+ jmp .Lsqrx8x_reduction_loop
.align 32
.Lsqrx8x_reduction_loop:
adox $carry,%r15 # $carry is 0
adcx $carry,%r15 # cf=0
+ .byte 0x67
inc %rcx # of=0
jnz .Lsqrx8x_reduce
- lea 8*8($nptr),$nptr
- xor %rax,%rax
+ .byte 0x66,0x67
+ mov $carry,%rax # xor %rax,%rax
cmp 0(%rsp),$nptr # end of n[]?
jae .Lsqrx8x_no_tail
mov 48(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
- adcx 8*1($tptr),%r9
- adcx 8*2($tptr),%r10
- adcx 8*3($tptr),%r11
- adcx 8*4($tptr),%r12
- adcx 8*5($tptr),%r13
- adcx 8*6($tptr),%r14
- adcx 8*7($tptr),%r15
+ lea 8*8($nptr),$nptr
+ mov \$-8,%rcx
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
lea 8*8($tptr),$tptr
- sbb $carry,$carry # top carry
+ sbb %rax,%rax # top carry
- mov \$-8,%rcx
- mov $carry,16(%rsp)
xor $carry,$carry # of=0, cf=0
+ mov %rax,16(%rsp)
jmp .Lsqrx8x_tail
.align 32
mulx 8*7($nptr),%rax,%r15
mov 48+72(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
- .byte 0x66
+ .byte 0x67
adox $carry,%r15
mov %rbx,($tptr,%rcx,8) # save result
mov %r8,%rbx
inc %rcx # of=0
jnz .Lsqrx8x_tail
- lea 8*8($nptr),$nptr
cmp 0(%rsp),$nptr # end of n[]?
jae .Lsqrx8x_tail_done # break out of loop
- sub 16(%rsp),$carry # neg $carry
+ sub 16(%rsp),$carry # mov 16(%rsp),%cf
mov 48(%rsp),%rdx # pull n0*a[0]
- adcx 8*0($tptr),%r8
- adcx 8*1($tptr),%r9
- adcx 8*2($tptr),%r10
- adcx 8*3($tptr),%r11
- adcx 8*4($tptr),%r12
- adcx 8*5($tptr),%r13
- adcx 8*6($tptr),%r14
- adcx 8*7($tptr),%r15
+ lea 8*8($nptr),$nptr
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
lea 8*8($tptr),$tptr
- sbb $carry,$carry
-
mov \$-8,%rcx
- mov $carry,16(%rsp)
+ sbb %rax,%rax
+
xor $carry,$carry # of=0, cf=0
+ mov %rax,16(%rsp)
jmp .Lsqrx8x_tail
.align 32
.Lsqrx8x_tail_done:
add 24(%rsp),%r8 # can this overflow?
- xor %rax,%rax
+ mov $carry,%rax # xor %rax,%rax
- sub 16(%rsp),$carry # neg $carry
-.Lsqrx8x_no_tail: # carry flag is 0
+ sub 16(%rsp),$carry # mov 16(%rsp),%cf
+.Lsqrx8x_no_tail: # %cf is 0 if jumped here
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
adc 8*7($tptr),%r15
adc %rax,%rax # top-most carry
- cmp 8(%rsp),$carry # end of t[]?
mov 32(%rsp),%rbx # n0
mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
- lea 8*8($tptr,%rcx),$tptr # start of current t[] window
- mov %r8,-8*8($carry) # store top 512 bits
- mov %r9,-8*7($carry)
- mov %r10,-8*6($carry)
- mov %r11,-8*5($carry)
- mov %r12,-8*4($carry)
- mov %r13,-8*3($carry)
- mov %r14,-8*2($carry)
- mov %r15,-8*1($carry)
+ mov %r8,8*0($tptr) # store top 512 bits
+ mov %r9,8*1($tptr)
+ mov %r10,8*2($tptr)
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+ lea 8*8($tptr,%rcx),$tptr # start of current t[] window
+ cmp 8(%rsp),$carry # end of t[]?
jb .Lsqrx8x_reduction_loop
- mov %rcx,$num
- neg $num # restore $num
+ mov %rcx,%rdx # -$num
+ jmp .Lsqrx8x_post
___
}\f
##############################################################
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
- lea ($nptr,$num),$nptr # end of $nptr
- lea 48(%rsp,$num),$lptr # end of lower half of t[2*num]
- lea 48(%rsp,$num),$tptr
+.align 32
+.Lsqrx8x_post:
+ neg %rdx # restore $num
neg %rax # top-most carry as mask
+ mov 0*8($nptr),%r8 # loaded before $nptr is moved by the lea below (replaces the ($nptr,$i)-indexed loads)
+ mov 1*8($nptr),%r9
+ lea ($nptr,%rdx),$nptr # end of $nptr
+ lea 48(%rsp,%rdx),$lptr # end of lower half of t[2*num]
+ lea 48(%rsp,%rdx),$tptr
+ .byte 0x67 # address-size prefix glued onto the xor below; that xor has no memory operand, so presumably padding only -- TODO confirm intent
xor %rdx,%rdx
movq %xmm1,$rptr # restore $rptr
- mov 0*8($nptr,$i),%r8
- mov 1*8($nptr,$i),%r9
neg %r8
jmp .Lsqrx8x_sub_entry
-.align 32
+.byte 0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 # 10 bytes: prefixed long nop (0x0f,0x1f) emitted in place of the removed .align 32
.Lsqrx8x_sub:
mov 0*8($nptr,$i),%r8
mov 1*8($nptr,$i),%r9
not %r8
-.Lsqrx8x_sub_entry:
+.Lsqrx8x_sub_entry: # aligned at 32
mov 2*8($nptr,$i),%r10
not %r9
and %rax,%r8
movdqa %xmm0,2*8($lptr,$i)
and %rax,%r15
- neg %rdx # mov %rdx,%cf
+ neg %edx # mov %edx,%cf
movdqa %xmm0,4*8($lptr,$i)
adc 0*8($tptr),%r8
+ mov %r8,0*8($rptr) # result
adc 1*8($tptr),%r9
movdqa %xmm0,6*8($lptr,$i)
adc 2*8($tptr),%r10
+ mov %r9,1*8($rptr)
adc 3*8($tptr),%r11
movdqa %xmm0,0*8($tptr) # zap upper half
adc 4*8($tptr),%r12
+ mov %r10,2*8($rptr)
adc 5*8($tptr),%r13
movdqa %xmm0,2*8($tptr)
adc 6*8($tptr),%r14
+ mov %r11,3*8($rptr)
adc 7*8($tptr),%r15
+ sbb %edx,%edx # mov %cf,%edx
movdqa %xmm0,4*8($tptr)
- sbb %rdx,%rdx # mov %cf,%rdx
movdqa %xmm0,6*8($tptr)
lea 8*8($tptr),$tptr
-
- mov %r8,0*8($rptr)
- mov %r9,1*8($rptr)
- mov %r10,2*8($rptr)
- mov %r11,3*8($rptr)
mov %r12,4*8($rptr)
mov %r13,5*8($rptr)
mov %r14,6*8($rptr)