--- /dev/null
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+
+# December 2005
+#
+# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
+# for undertaken effort are multiple. First of all, UltraSPARC is not
+# the whole SPARCv9 universe and other VIS-free implementations deserve
+# optimized code as much. Secondly, newly introduced UltraSPARC T1,
+# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive pathes,
+# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
+# several integrated RSA/DSA accelerator circuits accessible through
+# kernel driver [only(*)], but having decent user-land software
+# implementation is important too. Finally, reasons like desire to
+# experiment with dedicated squaring procedure. Yes, this module
+# implements one, because it was easiest to draft it in SPARCv9
+# instructions...
+
+# (*) Engine accessing the driver in question is on my TODO list.
+# For reference, acceleator is estimated to give 6 to 10 times
+# improvement on single-threaded RSA sign. It should be noted
+# that 6-10x improvement coefficient does not actually mean
+# something extraordinary in terms of absolute [single-threaded]
+# performance, as SPARCv9 instruction set is by all means least
+# suitable for high performance crypto among other 64 bit
+# platforms. 6-10x factor simply places T1 in same performance
+# domain as say AMD64 and IA-64. Improvement of RSA verify don't
+# appear impressive at all, but it's the sign operation which is
+# far more critical/interesting.
+
+# You might notice that inner loops are modulo-scheduled:-) This has
+# essentially negligible impact on UltraSPARC performance, it's
+# Fujitsu SPARC64 V users who should notice and hopefully appreciate
+# the advantage... Currently this module surpasses sparcv9a-mont.pl
+# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
+# module still have hidden potential [see TODO list there], which is
+# estimated to be larger than 20%...
+
+# int bn_mul_mont(
+$rp="%i0"; # BN_ULONG *rp,
+$ap="%i1"; # const BN_ULONG *ap,
+$bp="%i2"; # const BN_ULONG *bp,
+$np="%i3"; # const BN_ULONG *np,
+$n0="%i4"; # const BN_ULONG *n0,
+$num="%i5"; # int num);
+
+$bits=32;
+for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64) { $bias=2047; $frame=192; }
+else { $bias=0; $frame=128; }
+
+$car0="%o0";
+$car1="%o1";
+$car2="%o2"; # 1 bit
+$acc0="%o3";
+$acc1="%o4";
+$mask="%g1"; # 32 bits, what a waste...
+$tmp0="%g4";
+$tmp1="%g5";
+
+$i="%l0";
+$j="%l1";
+$mul0="%l2";
+$mul1="%l3";
+$tp="%l4";
+$apj="%l5";
+$npj="%l6";
+$tpj="%l7";
+
+$fname="bn_mul_mont";
+
+$code=<<___;
+.section ".text",#alloc,#execinstr
+
+.global $fname
+.align 32
+$fname:
+ cmp %o5,4 ! 128 bits minimum
+ bge,pt %icc,.Lenter
+ sethi %hi(0xffffffff),$mask
+ retl
+ clr %o0
+.align 32
+.Lenter:
+ save %sp,-$frame,%sp
+ sll $num,2,$num ! num*=4
+ or $mask,%lo(0xffffffff),$mask
+ ld [$n0],$n0
+ cmp $ap,$bp
+ and $num,$mask,$num
+ ld [$bp],$mul0 ! bp[0]
+ be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
+ nop
+
+ add %sp,$bias,%o7 ! real top of stack
+ ld [$ap],$car0 ! ap[0]
+ sub %o7,$num,%o7
+ ld [$ap+4],$apj ! ap[1]
+ and %o7,-1024,%o7
+ ld [$np],$car1 ! np[0]
+ sub %o7,$bias,%sp ! alloca
+ ld [$np+4],$npj ! np[1]
+ mov 12,$j
+
+ mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
+ mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
+ and $car0,$mask,$acc0
+ add %sp,$bias+$frame,$tp
+ ld [$ap+8],$apj !prologue!
+
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
+ mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ ld [$np+8],$npj !prologue!
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.L1st:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ add $j,4,$j ! j++
+ mov $tmp0,$acc0
+ st $car1,[$tp]
+ cmp $j,$num
+ mov $tmp1,$acc1
+ srlx $car1,32,$car1
+ bl %icc,.L1st
+ add $tp,4,$tp ! tp++
+!.L1st
+
+ mulx $apj,$mul0,$tmp0 !epilogue!
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $tmp0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car1
+
+ add $car0,$car1,$car1
+ st $car1,[$tp+8]
+ srlx $car1,32,$car2
+\f
+ mov 4,$i ! i++
+ ld [$bp+4],$mul0 ! bp[1]
+.Louter:
+ add %sp,$bias+$frame,$tp
+ ld [$ap],$car0 ! ap[0]
+ ld [$ap+4],$apj ! ap[1]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ ld [$tp],$tmp1 ! tp[0]
+ ld [$tp+4],$tpj ! tp[1]
+ mov 12,$j
+
+ mulx $car0,$mul0,$car0
+ mulx $apj,$mul0,$tmp0 !prologue!
+ add $tmp1,$car0,$car0
+ ld [$ap+8],$apj !prologue!
+ and $car0,$mask,$acc0
+
+ mulx $n0,$acc0,$mul1
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1
+ mulx $npj,$mul1,$acc1 !prologue!
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ ld [$np+8],$npj !prologue!
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.Linner:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $tpj,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ add $acc0,$car0,$car0
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ and $car0,$mask,$acc0
+ ld [$tp+8],$tpj ! tp[j]
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ add $j,4,$j ! j++
+ mov $tmp0,$acc0
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ mov $tmp1,$acc1
+ cmp $j,$num
+ bl %icc,.Linner
+ add $tp,4,$tp ! tp++
+!.Linner
+
+ mulx $apj,$mul0,$tmp0 !epilogue!
+ mulx $npj,$mul1,$tmp1
+ add $tpj,$car0,$car0
+ add $acc0,$car0,$car0
+ ld [$tp+8],$tpj ! tp[j]
+ and $car0,$mask,$acc0
+ add $acc1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $tpj,$car0,$car0
+ add $tmp0,$car0,$car0
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4] ! tp[j-1]
+ srlx $car0,32,$car0
+ add $i,4,$i ! i++
+ srlx $car1,32,$car1
+
+ add $car0,$car1,$car1
+ cmp $i,$num
+ add $car2,$car1,$car1
+ st $car1,[$tp+8]
+
+ srlx $car1,32,$car2
+ bl,a %icc,.Louter
+ ld [$bp+$i],$mul0 ! bp[i]
+!.Louter
+
+ add $tp,12,$tp
+\f
+.Ltail:
+ add $np,$num,$np
+ add $rp,$num,$rp
+
+ cmp $car2,0 ! clears %icc.c
+ bne,pn %icc,.Lsub
+ sub %g0,$num,%o7 ! k=-num
+
+ cmp $car1,$npj ! compare top-most $tp and $np words
+ bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
+ nop
+
+.align 16,0x1000000
+.Lsub:
+ ld [$tp+%o7],%o0
+ ld [$np+%o7],%o1
+ subccc %o0,%o1,%o1
+ st %o1,[$rp+%o7]
+ add %o7,4,%o7
+ brnz %o7,.Lsub
+ nop
+ subccc $car2,0,$car2
+ bcc %icc,.Lzap
+ sub %g0,$num,%o7
+
+.align 16,0x1000000
+.Lcopy:
+ ld [$tp+%o7],%o0
+ st %o0,[$rp+%o7]
+ add %o7,4,%o7
+ brnz %o7,.Lcopy
+ nop
+ ba .Lzap
+ sub %g0,$num,%o7
+
+.align 32
+.Lzap:
+ st %g0,[$tp+%o7]
+ add %o7,4,%o7
+ brnz %o7,.Lzap
+ nop
+ mov 1,%i0
+ ret
+ restore
+___
+\f
+########
+######## bn_sqr_mont gives up to 20% improvement over above code
+########
+$sbit="%i2"; # re-use $bp!
+
+$code.=<<___;
+.align 32
+.Lbn_sqr_mont:
+ add %sp,$bias,%o7 ! real top of stack
+ ld [$ap+4],$apj ! ap[1]
+ sub %o7,$num,%o7
+ ld [$np],$car1 ! np[0]
+ and %o7,-1024,%o7
+ ld [$np+4],$npj ! np[1]
+ sub %o7,$bias,%sp ! alloca
+ mov 12,$j
+
+ mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
+ mulx $apj,$mul0,$tmp0 !prologue!
+ and $car0,$mask,$acc0
+ add %sp,$bias+$frame,$tp
+ ld [$ap+8],$apj !prologue!
+
+ mulx $n0,$acc0,$mul1 ! "t[0]"*n0
+ srlx $car0,32,$car0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
+ mulx $npj,$mul1,$acc1 !prologue!
+ and $car0,1,$sbit
+ ld [$np+8],$npj !prologue!
+ srlx $car0,1,$car0
+ add $acc0,$car1,$car1
+ srlx $car1,32,$car1
+ mov $tmp0,$acc0 !prologue!
+
+.Lsqr_1st:
+ mulx $apj,$mul0,$tmp0
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
+ add $acc1,$car1,$car1
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ mov $tmp1,$acc1
+ srlx $acc0,32,$sbit
+ add $j,4,$j ! j++
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ mov $tmp0,$acc0
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_1st
+ add $tp,4,$tp ! tp++
+!.Lsqr_1st
+
+ mulx $apj,$mul0,$tmp0 ! epilogue
+ mulx $npj,$mul1,$tmp1
+ add $acc0,$car0,$car0 ! ap[j]*a0+c0
+ add $acc1,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $tmp0,$car0,$car0 ! ap[j]*a0+c0
+ add $tmp1,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ st $car1,[$tp+8]
+ srlx $car1,32,$car2
+\f
+ ld [%sp+$bias+$frame],$tmp0 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
+ ld [%sp+$bias+$frame+8],$tpj ! tp[2]
+ ld [$ap+4],$mul0 ! ap[1]
+ ld [$ap+8],$apj ! ap[2]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp0,$mul1
+
+ mulx $mul0,$mul0,$car0
+ and $mul1,$mask,$mul1
+
+ mulx $car1,$mul1,$car1
+ mulx $npj,$mul1,$acc1
+ add $tmp0,$car1,$car1
+ and $car0,$mask,$acc0
+ ld [$np+8],$npj ! np[2]
+ srlx $car1,32,$car1
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add $acc0,$car1,$car1
+ and $car0,1,$sbit
+ add $acc1,$car1,$car1
+ srlx $car0,1,$car0
+ mov 12,$j
+ st $car1,[%sp+$bias+$frame] ! tp[0]=
+ srlx $car1,32,$car1
+ add %sp,$bias+$frame+4,$tp
+
+.Lsqr_2nd:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $acc0,$car0,$car0
+ add $tpj,$car1,$car1
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc1,$car1,$car1
+ ld [$tp+8],$tpj ! tp[j]
+ add $acc0,$acc0,$acc0
+ add $j,4,$j ! j++
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_2nd
+ add $tp,4,$tp ! tp++
+!.Lsqr_2nd
+
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $acc0,$car0,$car0
+ add $tpj,$car1,$car1
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc1,$car1,$car1
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+\f
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
+ ld [$ap+8],$mul0 ! ap[2]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp1,$mul1
+ and $mul1,$mask,$mul1
+ mov 8,$i
+
+ mulx $mul0,$mul0,$car0
+ mulx $car1,$mul1,$car1
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add %sp,$bias+$frame,$tp
+ srlx $car1,32,$car1
+ and $car0,1,$sbit
+ srlx $car0,1,$car0
+ mov 4,$j
+
+.Lsqr_outer:
+.Lsqr_inner1:
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $j,4,$j
+ ld [$tp+8],$tpj
+ cmp $j,$i
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_inner1
+ add $tp,4,$tp
+!.Lsqr_inner1
+
+ add $j,4,$j
+ ld [$ap+$j],$apj ! ap[j]
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ ld [$np+$j],$npj ! np[j]
+ add $acc0,$car1,$car1
+ ld [$tp+8],$tpj ! tp[j]
+ add $acc1,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $j,4,$j
+ cmp $j,$num
+ be,pn %icc,.Lsqr_no_inner2
+ add $tp,4,$tp
+
+.Lsqr_inner2:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car0,$car0
+ ld [$ap+$j],$apj ! ap[j]
+ and $car0,$mask,$acc0
+ ld [$np+$j],$npj ! np[j]
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ ld [$tp+8],$tpj ! tp[j]
+ or $sbit,$acc0,$acc0
+ add $j,4,$j ! j++
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ cmp $j,$num
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_inner2
+ add $tp,4,$tp ! tp++
+
+.Lsqr_no_inner2:
+ mulx $apj,$mul0,$acc0
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car0,$car0
+ and $car0,$mask,$acc0
+ srlx $car0,32,$car0
+ add $acc0,$acc0,$acc0
+ or $sbit,$acc0,$acc0
+ srlx $acc0,32,$sbit
+ and $acc0,$mask,$acc0
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp] ! tp[j-1]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+\f
+ add $i,4,$i ! i++
+ ld [%sp+$bias+$frame],$tmp1 ! tp[0]
+ ld [%sp+$bias+$frame+4],$tpj ! tp[1]
+ ld [$ap+$i],$mul0 ! ap[j]
+ ld [$np],$car1 ! np[0]
+ ld [$np+4],$npj ! np[1]
+ mulx $n0,$tmp1,$mul1
+ and $mul1,$mask,$mul1
+ add $i,4,$tmp0
+
+ mulx $mul0,$mul0,$car0
+ mulx $car1,$mul1,$car1
+ and $car0,$mask,$acc0
+ add $tmp1,$car1,$car1
+ srlx $car0,32,$car0
+ add %sp,$bias+$frame,$tp
+ srlx $car1,32,$car1
+ and $car0,1,$sbit
+ srlx $car0,1,$car0
+
+ cmp $tmp0,$num ! i<num-1
+ bl %icc,.Lsqr_outer
+ mov 4,$j
+\f
+.Lsqr_last:
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $j,4,$j
+ ld [$tp+8],$tpj
+ cmp $j,$i
+ add $acc1,$car1,$car1
+ ld [$np+$j],$npj
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+ bl %icc,.Lsqr_last
+ add $tp,4,$tp
+!.Lsqr_last
+
+ mulx $npj,$mul1,$acc1
+ add $tpj,$car1,$car1
+ add $acc0,$car1,$car1
+ add $acc1,$car1,$car1
+ st $car1,[$tp]
+ srlx $car1,32,$car1
+
+ add $car0,$car0,$car0 ! recover $car0
+ or $sbit,$car0,$car0
+ add $car0,$car1,$car1
+ add $car2,$car1,$car1
+ st $car1,[$tp+4]
+ srlx $car1,32,$car2
+
+ ba .Ltail
+ add $tp,8,$tp
+.type $fname,#function
+.size $fname,(.-$fname)
+___
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;