# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# 2048-bit +18%
# 4096-bit +4%
-$output = shift;
+$flavour = shift;
-if ($output =~ /32\-mont\.s/) {
+if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
+ $SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
-} elsif ($output =~ /64\-mont\.s/) {
+} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
+ $SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
-( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
- die "can't call ../perlasm/ppc-xlate.pl: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
$nhi="r0";
$code=<<___;
+.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE`
.align 4
L1st:
$LDX $aj,$ap,$j ; ap[j]
- $LDX $nj,$np,$j ; np[j]
addc $lo0,$alo,$hi0
+ $LDX $nj,$np,$j ; np[j]
addze $hi0,$ahi
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
- $UMULH $ahi,$aj,$m0
-
addc $lo1,$nlo,$hi1
+ $UMULH $ahi,$aj,$m0
addze $hi1,$nhi
$UMULL $nlo,$nj,$m1 ; np[j]*m1
- $UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
+ $UMULH $nhi,$nj,$m1
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi0,$hi0
-
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
-
- $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
$UMULH $ahi,$aj,$m0
-
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
- addze $hi1,$hi1
-
$UMULL $nlo,$nj,$m1 ; np[1]*m1
+ addze $hi1,$hi1
$UMULH $nhi,$nj,$m1
\f
mtctr $num
.align 4
Linner:
$LDX $aj,$ap,$j ; ap[j]
- $LD $tj,$BNSZ($tp) ; tp[j]
addc $lo0,$alo,$hi0
+ $LD $tj,$BNSZ($tp) ; tp[j]
addze $hi0,$ahi
$LDX $nj,$np,$j ; np[j]
- addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
- addze $hi0,$hi0
- $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
- $UMULH $ahi,$aj,$m0
-
addc $lo1,$nlo,$hi1
+ $UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi1,$nhi
+ $UMULH $ahi,$aj,$m0
+ addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
$UMULL $nlo,$nj,$m1 ; np[j]*m1
+ addze $hi0,$hi0
$UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
+ addi $j,$j,$BNSZ ; j++
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
-
- addi $j,$j,$BNSZ ; j++
addi $tp,$tp,$BNSZ ; tp++
bdnz- Linner
;Linner
ble- Louter
\f
addi $num,$num,2 ; restore $num
+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
mtctr $num
+
+.align 4
+Lsub: $LDX $tj,$tp,$j
+ $LDX $nj,$np,$j
+ subfe $aj,$nj,$tj ; tp[j]-np[j]
+ $STX $aj,$rp,$j
+ addi $j,$j,$BNSZ
+ bdnz- Lsub
+
li $j,0
+ mtctr $num
+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
- subfc. $ovf,$j,$ovf ; sets XER[CA]
- bne Lsub
- $UCMP $hi1,$nj
- bge Lsub
.align 4
-Lcopy:
- $LDX $tj,$tp,$j
+Lcopy: ; copy or in-place refresh
+ $LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
-Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
li r3,1
blr
.long 0
-.align 4
-Lsub: $LDX $tj,$tp,$j
- $LDX $nj,$np,$j
- subfe $tj,$nj,$tj ; tp[j]-np[j]
- $STX $tj,$rp,$j
- addi $j,$j,$BNSZ
- bdnz- Lsub
- li $j,0
- subfe. $ovf,$j,$ovf
- mtctr $num
- bne Lcopy
-.align 4
-Lzap: $STX $j,$tp,$j
- addi $j,$j,$BNSZ
- bdnz- Lzap
- b Lexit
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;