stq $hi1,16($tp)
bne $tj,.Louter
\f
- s8addq $num,sp,$ap
- mov $rp,$bp
+ s8addq $num,sp,$tj # &tp[num]
+ mov $rp,$bp # put rp aside
mov sp,$tp
- mov 0,$hi0
-
- bne $hi1,.Lsub
- cmpult $nj,$lo1,AT
- bne AT,.Lsub
-
-.align 4
-.Lcopy: ldq AT,($tp)
- lda $tp,8($tp)
- stq AT,($rp)
- cmpult $tp,$ap,AT
- stq zero,-8($tp)
- nop
- lda $rp,8($rp)
- bne AT,.Lcopy
- mov 1,v0
- br .Lexit
+ mov sp,$ap
+ srl $nj,62,AT # boundary condition...
+ beq AT,.Lcopy # ... is met
+ mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,($tp)
ldq $lo1,($np)
- subq $lo0,$lo1,$lo1
+ lda $tp,8($tp)
+ lda $np,8($np)
+ subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
- lda $tp,8($tp)
or $hi0,AT,$hi0
- lda $np,8($np)
stq $lo0,($rp)
- cmpult $tp,$ap,v0
+ cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
- subq $hi1,$hi0,$hi0
+ subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
- cmpule $hi1,$hi0,AT
- mov $bp,$rp
- bne AT,.Lcopy
+ mov $bp,$rp # restore rp
+
+ and sp,$hi0,$ap
+ bic $bp,$hi0,$bp
+ bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
-.Lzap: stq zero,($tp)
- cmpult $tp,$ap,AT
+.Lcopy: ldq $aj,($ap) # copy or in-place refresh
lda $tp,8($tp)
- bne AT,.Lzap
+ lda $rp,8($rp)
+ lda $ap,8($ap)
+ stq zero,-8($tp) # zap tp
+ cmpult $tp,$tj,AT
+ stq $aj,-8($rp)
+ bne AT,.Lcopy
mov 1,v0
-.align 4
.Lexit:
.set noreorder
mov fp,sp
cmp $num,#2
movlt r0,#0
addlt sp,sp,#2*4
- blt .Labort
+ blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
+ mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
- cmp $nhi,#0 @ upmost carry
- bne .Lsub
- cmp $nlo,$nj @ tp[num-1]-np[num-1]
- bhs .Lsub
-
-.Lcopy: ldr $tj,[$tp]
- str sp,[$tp],#4 @ zap tp
- str $tj,[$rp],#4
- cmp $tp,$num
- bne .Lcopy
-
-.Lexit: add sp,$num,#4 @ skip over tp[num+1]
- ldmia sp!,{r4-r12,lr} @ restore registers
- add sp,sp,#2*4 @ skip over {r0,r2}
- mov r0,#1
-.Labort:tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
+ movs $tj,$nj,lsr#30 @ boundary condition...
+ beq .Lcopy @ ... is met
+ subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
- blo .Lcopy @ tp was less after all
-.Lzap: str sp,[$tp],#4
+ and $ap,$tp,$nhi
+ bic $np,$rp,$nhi
+ orr $ap,$ap,$np @ ap=borrow?tp:rp
+
+.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+ str sp,[$tp],#4 @ zap tp
+ str $tj,[$rp],#4
cmp $tp,$num
- bne .Lzap
- bal .Lexit
+ bne .Lcopy
+
+ add sp,$num,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4-r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt: tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+ bx lr @ interoperable with Thumb ISA:-)
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
-
+\f
.set noreorder
- PTR_ADD $ap,sp,$num
+ PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
+ move $ap,sp
- bnez $hi1,.Lsub
- li $hi0,0
- sgeu AT,$lo1,$nj
- beqz AT,.Lsub
- nop
+ dsrl AT,$nj,62 # boundary condition...
+ beqz AT,.Lcopy # ... is met
+ li $hi0,0 # clear borrow bit
.align 4
-.Lcopy: ld AT,($tp)
+.Lsub: ld $lo0,($tp)
+ ld $lo1,($np)
+ PTR_ADD $tp,8
+ PTR_ADD $np,8
+ dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
+ sgtu AT,$lo1,$lo0
+ dsubu $lo0,$lo1,$hi0
+ sgtu $hi0,$lo0,$lo1
+ sd $lo0,($rp)
+ or $hi0,AT
+ sltu AT,$tp,$tj
+ bnez AT,.Lsub
+ PTR_ADD $rp,8
+
+ dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
+ move $tp,sp
+ PTR_SUB $rp,$num # restore rp
+ not $hi1,$hi0
+
+ and $ap,$hi0,sp
+ and $bp,$hi1,$rp
+ or $ap,$ap,$bp # ap=borrow?tp:rp
+
+.align 4
+.Lcopy: ld $aj,($ap)
+ PTR_ADD $ap,8
PTR_ADD $tp,8
- sd AT,($rp)
- sltu AT,$tp,$ap
sd zero,-8($tp)
+ sltu AT,$tp,$tj
+ sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
-.Lexit:
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
li v0,1
jr ra
PTR_ADD sp,$fp,64
-
-.align 4
-.Lsub: ld $lo0,($tp)
- ld $lo1,($np)
- dsubu $lo1,$lo0,$lo1
- sgtu AT,$lo1,$lo0
- dsubu $lo0,$lo1,$hi0
- sgtu $hi0,$lo0,$lo1
- PTR_ADD $tp,8
- or $hi0,AT
- PTR_ADD $np,8
- sd $lo0,($rp)
- sltu AT,$tp,$ap
- bnez AT,.Lsub
- PTR_ADD $rp,8
-
- dsubu $hi0,$hi1,$hi0
- move $tp,sp
- sgtu AT,$hi0,$hi1
- bnez AT,.Lcopy
- PTR_SUB $rp,$num
-.align 4
-.Lzap: sd zero,($tp)
- sltu AT,$tp,$ap
- bnez AT,.Lzap
- PTR_ADD $tp,8
- b .Lexit
- nop
.set reorder
END(bn_mul_mont)
.rdata
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
+ $SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($output =~ /64\-mont\.s/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
+ $SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $output"; }
addi $i,$i,$BNSZ
ble- Louter
\f
+ $SHRI. $nj,$nj,$BITS-2 ; check boundary condition
addi $num,$num,2 ; restore $num
+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
+ addi $ap,$sp,$FRAME
mtctr $num
+ beq Lcopy ; boundary condition is met
+
+.align 4
+Lsub: $LDX $tj,$tp,$j
+ $LDX $nj,$np,$j
+ subfe $aj,$nj,$tj ; tp[j]-np[j]
+ $STX $aj,$rp,$j
+ addi $j,$j,$BNSZ
+ bdnz- Lsub
+
li $j,0
+ mtctr $num
+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
- subfc. $ovf,$j,$ovf ; sets XER[CA]
- bne Lsub
- $UCMP $hi1,$nj
- bge Lsub
.align 4
-Lcopy:
- $LDX $tj,$tp,$j
+Lcopy: ; copy or in-place refresh
+ $LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
-Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
li r3,1
blr
.long 0
-.align 4
-Lsub: $LDX $tj,$tp,$j
- $LDX $nj,$np,$j
- subfe $tj,$nj,$tj ; tp[j]-np[j]
- $STX $tj,$rp,$j
- addi $j,$j,$BNSZ
- bdnz- Lsub
- li $j,0
- subfe. $ovf,$j,$ovf
- mtctr $num
- bne Lcopy
-.align 4
-Lzap: $STX $j,$tp,$j
- addi $j,$j,$BNSZ
- bdnz- Lzap
- b Lexit
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
___
undef $bi;
-$count=$ap; undef $ap;
+$count=$bp; undef $bp;
$code.=<<___;
lg $rp,16+16($fp) # reincarnate rp
+ la $ap,8($fp)
lgr $j,$num
- ltgr $AHI,$AHI
- jnz .Lsub # upmost overflow bit is not zero
- #slg $NHI,-8($np) # tp[num-1]-np[num-1]
- lghi $count,-8 # buggy assembler
- slg $NHI,0($count,$np) # buggy assembler
- jnle .Lsub # branch if not borrow
-.Lcopy: lg $alo,8($j,$fp)
- stg $j,8($j,$fp)
- stg $alo,0($j,$rp)
- aghi $j,8
- jnz .Lcopy
-.Lexit:
- lmg %r6,%r15,16+48($fp)
- lghi %r2,1 # signal "processed"
- br %r14
+ #lg $nhi,-8($np) # buggy assembler
+ lghi $count,-8 # buggy assembler
+ lg $nhi,0($count,$np) # buggy assembler
+ srag $nhi,$nhi,62 # boundary condition...
+ jz .Lcopy # ... is met
-.Lsub: lcgr $count,$num
+ lcgr $count,$num
sra $count,3 # incidentally clears "borrow"
-.Lsubloop:
- lg $alo,8($j,$fp)
+.Lsub: lg $alo,0($j,$ap)
slbg $alo,0($j,$np)
stg $alo,0($j,$rp)
la $j,8($j)
- brct $count,.Lsubloop
+ brct $count,.Lsub
lghi $ahi,0
- slbgr $AHI,$ahi
+ slbgr $AHI,$ahi # handle upmost carry
+
+ ngr $ap,$AHI
+ lghi $np,-1
+ xgr $np,$AHI
+ ngr $np,$rp
+ ogr $ap,$np # ap=borrow?tp:rp
lgr $j,$num
- jle .Lcopy # branch if borrow
-.Lzap: stg $j,8($j,$fp)
+.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
+ stg $j,8($j,$fp) # zap tp
+ stg $alo,0($j,$rp)
aghi $j,8
- jnz .Lzap
- j .Lexit
+ jnz .Lcopy
+
+ lmg %r6,%r15,16+48($fp)
+ lghi %r2,1 # signal "processed"
+ br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
-
- cmp $car2,0 ! clears %icc.c
- bne,pn %icc,.Lsub
+ mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
- cmp $car1,$npj ! compare top-most $tp and $np words
- bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
- nop
+ srl $npj,30,%o0 ! boundary condition...
+ brz,pn %o0,.Lcopy ! ... is met
+ subcc %g0,%g0,%g0 ! clear %icc.c
.align 16,0x1000000
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
- subccc %o0,%o1,%o1
+ subccc %o0,%o1,%o1 ! tp[j]-np[j]
st %o1,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lsub
nop
- subccc $car2,0,$car2
- bcc %icc,.Lzap
+ subc $car2,0,$car2 ! handle upmost overflow bit
+ and $tp,$car2,$ap
+ andn $rp,$car2,$np
+ or $ap,$np,$ap
sub %g0,$num,%o7
.align 16,0x1000000
.Lcopy:
- ld [$tp+%o7],%o0
+ ld [$ap+%o7],%o0 ! copy or in-place refresh
+ st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
- ba .Lzap
- sub %g0,$num,%o7
-
-.align 32
-.Lzap:
- st %g0,[$tp+%o7]
- add %o7,4,%o7
- brnz %o7,.Lzap
- nop
mov 1,%i0
ret
restore
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
+.asciz "Montgomery Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
-.ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
.section ".text",#alloc,#execinstr
.global $fname
bnz %icc,.Louter
nop
\f
- sub %g0,$num,%o7 ! n=-num
- cmp $carry,0 ! clears %icc.c
- bne,pn %icc,.Lsub
- add $tp,8,$tp ! adjust tp to point at the end
-
- ld [$tp-8],%o0
ld [$np-4],%o1
- cmp %o0,%o1 ! compare topmost words
- bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
- nop
-
+ subcc %g0,%g0,%g0 ! clear %icc.c
+ add $tp,8,$tp ! adjust tp to point at the end
+ srl %o1,30,%o1 ! boundary condition...
+ orn %g0,%g0,%g4
+ brz,pn %o1,.Lcopy ! ... is met
+ sub %g0,$num,%o7 ! n=-num
+
.align 32,0x1000000
.Lsub:
ldx [$tp+%o7],%o0
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
- subccc $carry,0,$carry
- bcc,pt %icc,.Lzap
+ subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
-.align 16,0x1000000
+.align 32,0x1000000
.Lcopy:
ldx [$tp+%o7],%o0
- srlx %o0,32,%o1
add $rp,%o7,%g1
+ ld [%g1+0],%o2
+ ld [%g1+4],%o3
+ stx %g0,[$tp+%o7]
+ and %o0,%g4,%o0
+ srlx %o0,32,%o1
+ andn %o2,%g4,%o2
+ andn %o3,%g4,%o3
+ or %o2,%o0,%o0
+ or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
-.align 32
+.align 32,0x1000000
.Lzap:
- stx %g0,[$tp+%o7]
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
-push(@INC,".","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
-# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num]
+# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that SDK suggests to unconditionally allocate 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example 1024 bit private RSA key operations suffer >30%
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
- &cmp ("ecx",256);
+ &cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
- &add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4
&xor ("eax","eax");
+
+ &mov ("ecx","ebp");
+ &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded ap copy...
- # edi points at the end of ap copy...
&mov ("ecx","ebp");
- &add ("edi",$pad); # skip padding to point at bp copy
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded bp copy...
- # edi points at the end of bp copy...
&mov ("ecx","ebp");
- &add ("edi",$pad); # skip padding to point at np copy
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
+ &mov ("ecx",$pad/4);
+ &data_byte(0xf3,0xab); # rep stosl, bzero pad
+ # edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
- &xor ("eax","eax");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
- &xor ("edx","edx"); # i=0
- &lea ("esi",&DWP(64,"esp")); # tp
- # edi still points at the end of np copy...
+ &xor ("edx","edx"); # i=0
+ &lea ("esi",&DWP(64,"esp")); # tp
+ # edi still points at the end of padded np copy...
+ &mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1]
&neg ("ebp");
- &lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind"
- &mov ("edi",$rp); # restore rp
-
- &mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit
- &cmp ("ebx",0); # clears CF unconfitionally
- &jnz (&label("sub"));
- &mov ("eax",&DWP(-4,"esi","ecx",4));
- &cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]?
- &jae (&label("sub")); # if taken CF is cleared
-
-&set_label("copy",4);
- &mov ("ebx","ecx");
- &data_byte(0xf3,0xa5); # rep movsl
- &mov ("ecx","ebx");
- &jmp (&label("zap"));
-
-&set_label("sub",16);
+ &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
+ &mov ("edi",$rp); # restore rp
+
+ &shr ("eax",30); # boundary condition...
+ &jz (&label("copy")); # ... is met
+ &xor ("edx","edx"); # clear CF
+
+&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
- &dec ("ecx"); # doesn't affect CF!
- &jg (&label("sub"));
- &sbb ("ebx",0); # upmost overflow is still there
- &mov ("ecx","edx");
- &jc (&label("copy"));
+ &loop (&label("sub")); # doesn't affect CF!
+
+ &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
+ &sbb ("eax",0);
+ &and ("esi","eax");
+ &not ("eax"); # invert the select mask
+ &mov ("ebp","edi");
+ &and ("ebp","eax");
+ &or ("esi","ebp"); # tp=carry?tp:rp
+
+ &mov ("ecx","edx"); # num
+ &xor ("edx","edx"); # i=0
+
+&set_label("copy",8);
+ &mov ("eax",&DWP(0,"esi","edx",4));
+ &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
+ &mov (&DWP(0,"edi","edx",4),"eax");
+ &lea ("edx",&DWP(1,"edx")); # i++
+ &loop (&label("copy"));
-&set_label("zap",4);
&mov ("ebp",$sp);
&xor ("eax","eax");
- &lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4
- &mov ("edi","esp");
+
+ &mov ("ecx",64/4);
+ &mov ("edi","esp"); # zap frame including scratch area
+ &data_byte(0xf3,0xab); # rep stosl, bzero
+
+ # zap copies of ap, bp and np
+ &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+ &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&set_label("leave");
&function_end($func);
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
&asm_finish();
$i="edx";
$j="ecx";
-$ap="esi";
+$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
}
\f
&set_label("common_tail",16);
- &mov ($np,$_np);
- &mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
+ &mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
- # [$ap and $bp are zapped]
- &xor ($i,$i); # i=0
+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
+ &mov ("eax",&DWP(0,$np,$num,4)); # np[num-1]
+ &shr ("eax",30); # check for boundary condition
+ &jz (&label("copy"));
+
+ &mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
- &cmp ("esi",0); # clears CF unconditionally
- &jnz (&label("sub"));
- &mov ("eax",&DWP($frame,"esp",$j,4));
- &cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
- &jae (&label("sub")); # if taken CF is cleared
-&set_label("copy",16);
- &mov ("eax",&DWP($frame,"esp",$j,4));
- &mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i]
- &mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector
- &dec ($j);
- &jge (&label("copy"));
- &jmp (&label("exit"));
+ &xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
- &mov ("eax",&DWP($frame,"esp",$i,4));
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
- &lea ($i,&DWP(1,$i)); # i++
&dec ($j); # doesn't affect CF!
+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
+ &lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
- &mov ($j,$num); # j=num-1
- &sbb ("esi",0); # esi holds upmost overflow bit
- &jc (&label("copy"));
-&set_label("zap",8);
- &mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector
- &dec ($j);
- &jge (&label("zap"));
-
-&set_label("exit",8);
+
+ &sbb ("eax",0); # handle upmost overflow bit
+ &and ($tp,"eax");
+ &not ("eax"); # invert the select mask
+ &mov ($np,$rp);
+ &and ($np,"eax");
+ &or ($tp,$np); # tp=carry?tp:rp
+
+&set_label("copy",16); # copy or in-place refresh
+ &mov ("eax",&DWP(0,$tp,$num,4));
+ &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+ &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+ &dec ($num);
+ &jge (&label("copy"));
+
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
neg %rax
lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2))
and \$-1024,%rsp # minimize TLB usage
+
mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp
mov %rdx,$bp # $bp reassigned, remember?
cmp $num,$i
jl .Louter
- xor $i,$i # i=0
+ mov -8($np,$num,8),%rax # np[num-1]
+ lea (%rsp),$ap # borrow ap for tp
+ lea -1($num),$j # j=num-1: .Lcopy indexes by j, so set it before branching
+ shr \$62,%rax # check for boundary condition
+ jz .Lcopy
+
+ mov ($ap),%rax # tp[0]
lea -1($num),$j # j=num-1
- cmp \$0,%rdx # %rdx still holds upmost overflow bit
- jnz .Lsub # CF is cleared by compare with 0
- mov (%rsp,$j,8),%rax
- cmp ($np,$j,8),%rax # tp[num-1]-np[num-1]
- jae .Lsub # if taken CF was cleared by above cmp
-.align 4
-.Lcopy:
- mov (%rsp,$j,8),%rax
+ xor $i,$i # i=0 and clear CF!
+ jmp .Lsub
+.align 16
+.Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ dec $j # doesn't affect CF!
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ jge .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ and %rax,$ap
+ not %rax
+ mov $rp,$np
+ and %rax,$np
+ lea -1($num),$j
+ or $np,$ap # ap=borrow?tp:rp
+.align 16
+.Lcopy: # copy or in-place refresh
+ mov ($ap,$j,8),%rax
mov %rax,($rp,$j,8) # rp[i]=tp[i]
mov $i,(%rsp,$j,8) # zap temporary vector
dec $j
jge .Lcopy
-.align 4
-.Lexit:
+
mov 8(%rsp,$num,8),%rsp # restore %rsp
mov \$1,%rax
pop %r15
pop %rbp
pop %rbx
ret
-
-.align 16
-.Lsub: mov (%rsp,$i,8),%rax
- sbb ($np,$i,8),%rax
- mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j]
- lea 1($i),$i # i++
- dec $j # doesn't affect CF!
- jge .Lsub
- lea -1($num),$j # j=num-1
- sbb \$0,%rdx
- jc .Lcopy # tp was less than np
-.align 4
-.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector
- dec $j
- jge .Lzap
- jmp .Lexit
.size bn_mul_mont,.-bn_mul_mont
.asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___