# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# 2048-bit +18%
# 4096-bit +4%
-$output = shift;
+$flavour = shift;
-if ($output =~ /32\-mont\.s/) {
+if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
+ $SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
-} elsif ($output =~ /64\-mont\.s/) {
+} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
+ $SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
-( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
- die "can't call ../perlasm/ppc-xlate.pl: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
.machine "any"
.text
-.globl .bn_mul_mont
+.globl .bn_mul_mont_int
.align 4
-.bn_mul_mont:
+.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+ cmpwi $num,32 ; longer key performance is not better
+ bgelr
+___
+$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE`
ble- Louter
\f
addi $num,$num,2 ; restore $num
+ subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
mtctr $num
+
+.align 4
+Lsub: $LDX $tj,$tp,$j
+ $LDX $nj,$np,$j
+ subfe $aj,$nj,$tj ; tp[j]-np[j]
+ $STX $aj,$rp,$j
+ addi $j,$j,$BNSZ
+ bdnz- Lsub
+
li $j,0
+ mtctr $num
+ subfe $ovf,$j,$ovf ; handle upmost overflow bit
+ and $ap,$tp,$ovf
+ andc $np,$rp,$ovf
+ or $ap,$ap,$np ; ap=borrow?tp:rp
- subfc. $ovf,$j,$ovf ; sets XER[CA]
- bne Lsub
- $UCMP $hi1,$nj
- bge Lsub
.align 4
-Lcopy:
- $LDX $tj,$tp,$j
+Lcopy: ; copy or in-place refresh
+ $LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz- Lcopy
-Lexit:
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
li r3,1
blr
.long 0
-.align 4
-Lsub: $LDX $tj,$tp,$j
- $LDX $nj,$np,$j
- subfe $tj,$nj,$tj ; tp[j]-np[j]
- $STX $tj,$rp,$j
- addi $j,$j,$BNSZ
- bdnz- Lsub
- li $j,0
- subfe. $ovf,$j,$ovf
- mtctr $num
- bne Lcopy
-.align 4
-Lzap: $STX $j,$tp,$j
- addi $j,$j,$BNSZ
- bdnz- Lzap
- b Lexit
+.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;