2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20 # Because unlike integer multiplier, which simply stalls whole CPU,
21 # FPU is fully pipelined and can effectively emit 48 bit partial
22 # product every cycle. Why not blended SPARC v9? One can argue that
23 # making this module dependent on UltraSPARC VIS extension limits its
24 # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25 # implementations from compatibility matrix. But the rest, whole Sun
26 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27 # VIS extension instructions used in this module. This is considered
28 # good enough to not care about HAL SPARC64 users [if any] who have
29 # the integer-only pure SPARCv9 module to "fall down" to.
31 # USI&II cores currently exhibit uniform 2x improvement [over pre-
32 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
33 # performance improves by a few percent for shorter keys and worsens by a
34 # few percent for longer keys. This is because the USIII integer multiplier
35 # is >3x faster than USI&II one, which is harder to match [but see
36 # TODO list below]. It should also be noted that SPARC64 V features
37 # out-of-order execution, which *might* mean that integer multiplier
38 # is pipelined, which in turn *might* be impossible to match... On
39 # additional note, SPARC64 V implements FP Multiply-Add instruction,
40 # which is perfectly usable in this context... In other words, as far
41 # as Fujitsu SPARC64 V goes, talk to the author:-)
43 # The implementation implies the following "non-natural" limitations on
45 # - num may not be less than 4;
46 # - num has to be even;
47 # Failure to meet either condition has no fatal effects, simply
48 # doesn't give any performance gain.
51 # - modulo-schedule inner loop for better performance (on in-order
52 # execution core such as UltraSPARC this shall result in further
53 # noticeable(!) improvement);
54 # - dedicated squaring procedure[?];
56 ######################################################################
59 # Modulo-scheduled inner loops make it possible to interleave floating
60 # point and integer instructions and to minimize Read-After-Write penalties. This
61 # results in *further* 20-50% performance improvement [depending on
62 # key length, more for longer keys] on USI&II cores and 30-80% - on
66 open STDOUT,">$output";
68 $fname="bn_mul_mont_fpu";
74 # In order to provide for 32-/64-bit ABI duality, I keep integers wider
75 # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
76 # exclusively for pointers, indexes and other small values...
78 $rp="%i0"; # BN_ULONG *rp,
79 $ap="%i1"; # const BN_ULONG *ap,
80 $bp="%i2"; # const BN_ULONG *bp,
81 $np="%i3"; # const BN_ULONG *np,
82 $n0="%i4"; # const BN_ULONG *n0,
83 $num="%i5"; # int num);
86 $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
87 $ap_h="%l2"; # to these four vectors as double-precision FP values.
88 $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
89 $np_h="%l4"; # loop and L1-cache aliasing is minimized...
92 $mask="%l7"; # 16-bit mask, 0xffff
94 $n0="%g4"; # reassigned(!) to "64-bit" register
95 $carry="%i4"; # %i4 reused(!) for a carry bit
97 # FP register naming chart
112 $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
113 $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
114 $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
115 $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
117 $dota="%f24"; $dotb="%f26";
119 $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
120 $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
121 $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
122 $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
124 $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
127 #include "sparc_arch.h"
129 .section ".text",#alloc,#execinstr
134 save %sp,-$frame-$locals,%sp
139 andcc $num,1,%g0 ! $num has to be even...
141 clr %i0 ! signal "unsupported input value"
144 sethi %hi(0xffff),$mask
145 ld [%i4+0],$n0 ! $n0 reassigned, remember?
146 or $mask,%lo(0xffff),$mask
149 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
151 sll $num,3,$num ! num*=8
153 add %sp,$bias,%o0 ! real top of stack
155 add %o1,$num,%o1 ! %o1=num*5
157 and %o0,-2048,%o0 ! optimize TLB utilization
158 sub %o0,$bias,%sp ! alloca(5*num*8)
160 rd %asi,%o7 ! save %asi
161 add %sp,$bias+$frame+$locals,$tp
163 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
168 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
170 add $rp,$num,$rp ! readjust input pointers to point
171 add $ap,$num,$ap ! at the ends too...
175 stx %o7,[%sp+$bias+$frame+48] ! save %asi
177 sub %g0,$num,$i ! i=-num
178 sub %g0,$num,$j ! j=-num
183 ld [%o3+4],%g1 ! bp[0]
185 ld [%o4+4],%g5 ! ap[0]
194 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
195 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
196 stx %o0,[%sp+$bias+$frame+0]
198 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
202 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
207 ! transfer b[i] to FPU as 4x16-bit values
217 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
218 ldda [%sp+$bias+$frame+6]%asi,$na
220 ldda [%sp+$bias+$frame+4]%asi,$nb
222 ldda [%sp+$bias+$frame+2]%asi,$nc
224 ldda [%sp+$bias+$frame+0]%asi,$nd
227 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
231 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
241 faddd $aloa,$nloa,$nloa
244 faddd $alob,$nlob,$nlob
247 faddd $aloc,$nloc,$nloc
250 faddd $alod,$nlod,$nlod
253 faddd $ahia,$nhia,$nhia
256 faddd $ahib,$nhib,$nhib
259 faddd $ahic,$nhic,$dota ! $nhic
260 faddd $ahid,$nhid,$dotb ! $nhid
262 faddd $nloc,$nhia,$nloc
263 faddd $nlod,$nhib,$nlod
270 std $nloa,[%sp+$bias+$frame+0]
272 std $nlob,[%sp+$bias+$frame+8]
274 std $nloc,[%sp+$bias+$frame+16]
276 std $nlod,[%sp+$bias+$frame+24]
278 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
282 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
292 ldx [%sp+$bias+$frame+0],%o0
294 ldx [%sp+$bias+$frame+8],%o1
296 ldx [%sp+$bias+$frame+16],%o2
298 ldx [%sp+$bias+$frame+24],%o3
302 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
306 faddd $aloa,$nloa,$nloa
309 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
313 faddd $alob,$nlob,$nlob
317 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
318 faddd $aloc,$nloc,$nloc
328 !or %o7,%o0,%o0 ! 64-bit result
329 srlx %o3,16,%g1 ! 34-bit carry
332 faddd $alod,$nlod,$nlod
335 faddd $ahia,$nhia,$nhia
338 faddd $ahib,$nhib,$nhib
341 faddd $dota,$nloa,$nloa
342 faddd $dotb,$nlob,$nlob
343 faddd $ahic,$nhic,$dota ! $nhic
344 faddd $ahid,$nhid,$dotb ! $nhid
346 faddd $nloc,$nhia,$nloc
347 faddd $nlod,$nhib,$nlod
354 std $nloa,[%sp+$bias+$frame+0]
355 std $nlob,[%sp+$bias+$frame+8]
357 std $nloc,[%sp+$bias+$frame+16]
359 std $nlod,[%sp+$bias+$frame+24]
361 .align 32 ! incidentally already aligned !
365 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
369 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
379 ldx [%sp+$bias+$frame+0],%o0
381 ldx [%sp+$bias+$frame+8],%o1
383 ldx [%sp+$bias+$frame+16],%o2
385 ldx [%sp+$bias+$frame+24],%o3
389 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
393 faddd $aloa,$nloa,$nloa
396 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
400 faddd $alob,$nlob,$nlob
404 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
406 faddd $aloc,$nloc,$nloc
412 faddd $alod,$nlod,$nlod
418 faddd $ahia,$nhia,$nhia
422 or %o7,%o0,%o0 ! 64-bit result
423 faddd $ahib,$nhib,$nhib
426 faddd $dota,$nloa,$nloa
427 srlx %o3,16,%g1 ! 34-bit carry
428 faddd $dotb,$nlob,$nlob
432 stx %o0,[$tp] ! tp[j-1]=
434 faddd $ahic,$nhic,$dota ! $nhic
435 faddd $ahid,$nhid,$dotb ! $nhid
437 faddd $nloc,$nhia,$nloc
438 faddd $nlod,$nhib,$nlod
445 std $nloa,[%sp+$bias+$frame+0]
446 std $nlob,[%sp+$bias+$frame+8]
447 std $nloc,[%sp+$bias+$frame+16]
448 std $nlod,[%sp+$bias+$frame+24]
458 ldx [%sp+$bias+$frame+0],%o0
459 ldx [%sp+$bias+$frame+8],%o1
460 ldx [%sp+$bias+$frame+16],%o2
461 ldx [%sp+$bias+$frame+24],%o3
464 std $dota,[%sp+$bias+$frame+32]
466 std $dotb,[%sp+$bias+$frame+40]
470 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
479 or %o7,%o0,%o0 ! 64-bit result
480 ldx [%sp+$bias+$frame+32],%o4
482 ldx [%sp+$bias+$frame+40],%o5
483 srlx %o3,16,%g1 ! 34-bit carry
487 stx %o0,[$tp] ! tp[j-1]=
501 stx %o4,[$tp] ! tp[num-1]=
507 sub %g0,$num,$j ! j=-num
508 add %sp,$bias+$frame+$locals,$tp
513 ld [%o3+4],%g1 ! bp[i]
515 ld [%o4+4],%g5 ! ap[0]
522 ldx [$tp],%o2 ! tp[0]
525 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
526 stx %o0,[%sp+$bias+$frame+0]
528 ! transfer b[i] to FPU as 4x16-bit values
534 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
535 ldda [%sp+$bias+$frame+6]%asi,$na
537 ldda [%sp+$bias+$frame+4]%asi,$nb
539 ldda [%sp+$bias+$frame+2]%asi,$nc
541 ldda [%sp+$bias+$frame+0]%asi,$nd
543 ldd [$ap_l+$j],$alo ! load a[j] in double format
547 ldd [$np_l+$j],$nlo ! load n[j] in double format
557 faddd $aloa,$nloa,$nloa
560 faddd $alob,$nlob,$nlob
563 faddd $aloc,$nloc,$nloc
566 faddd $alod,$nlod,$nlod
569 faddd $ahia,$nhia,$nhia
572 faddd $ahib,$nhib,$nhib
575 faddd $ahic,$nhic,$dota ! $nhic
576 faddd $ahid,$nhid,$dotb ! $nhid
578 faddd $nloc,$nhia,$nloc
579 faddd $nlod,$nhib,$nlod
586 std $nloa,[%sp+$bias+$frame+0]
587 std $nlob,[%sp+$bias+$frame+8]
588 std $nloc,[%sp+$bias+$frame+16]
590 std $nlod,[%sp+$bias+$frame+24]
592 ldd [$ap_l+$j],$alo ! load a[j] in double format
594 ldd [$np_l+$j],$nlo ! load n[j] in double format
602 ldx [%sp+$bias+$frame+0],%o0
603 faddd $aloa,$nloa,$nloa
605 ldx [%sp+$bias+$frame+8],%o1
607 ldx [%sp+$bias+$frame+16],%o2
608 faddd $alob,$nlob,$nlob
610 ldx [%sp+$bias+$frame+24],%o3
614 faddd $aloc,$nloc,$nloc
619 faddd $alod,$nlod,$nlod
624 faddd $ahia,$nhia,$nhia
626 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
632 faddd $ahib,$nhib,$nhib
635 faddd $dota,$nloa,$nloa
637 faddd $dotb,$nlob,$nlob
640 faddd $ahic,$nhic,$dota ! $nhic
642 faddd $ahid,$nhid,$dotb ! $nhid
643 or %o7,%o0,%o0 ! 64-bit result
645 faddd $nloc,$nhia,$nloc
648 faddd $nlod,$nhib,$nlod
649 srlx %o3,16,%g1 ! 34-bit carry
658 std $nloa,[%sp+$bias+$frame+0]
659 std $nlob,[%sp+$bias+$frame+8]
661 std $nloc,[%sp+$bias+$frame+16]
662 bz,pn %icc,.Linnerskip
663 std $nlod,[%sp+$bias+$frame+24]
669 ldd [$ap_l+$j],$alo ! load a[j] in double format
671 ldd [$np_l+$j],$nlo ! load n[j] in double format
679 ldx [%sp+$bias+$frame+0],%o0
680 faddd $aloa,$nloa,$nloa
682 ldx [%sp+$bias+$frame+8],%o1
684 ldx [%sp+$bias+$frame+16],%o2
685 faddd $alob,$nlob,$nlob
687 ldx [%sp+$bias+$frame+24],%o3
691 faddd $aloc,$nloc,$nloc
696 faddd $alod,$nlod,$nlod
701 faddd $ahia,$nhia,$nhia
703 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
708 faddd $ahib,$nhib,$nhib
711 faddd $dota,$nloa,$nloa
713 faddd $dotb,$nlob,$nlob
716 faddd $ahic,$nhic,$dota ! $nhic
718 faddd $ahid,$nhid,$dotb ! $nhid
719 or %o7,%o0,%o0 ! 64-bit result
720 faddd $nloc,$nhia,$nloc
722 ldx [$tp+8],%o7 ! tp[j]
723 faddd $nlod,$nhib,$nlod
724 srlx %o3,16,%g1 ! 34-bit carry
734 stx %o0,[$tp] ! tp[j-1]
737 std $nloa,[%sp+$bias+$frame+0]
738 std $nlob,[%sp+$bias+$frame+8]
739 std $nloc,[%sp+$bias+$frame+16]
741 std $nlod,[%sp+$bias+$frame+24]
749 ldx [%sp+$bias+$frame+0],%o0
750 ldx [%sp+$bias+$frame+8],%o1
751 ldx [%sp+$bias+$frame+16],%o2
752 ldx [%sp+$bias+$frame+24],%o3
755 std $dota,[%sp+$bias+$frame+32]
757 std $dotb,[%sp+$bias+$frame+40]
761 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
770 ldx [%sp+$bias+$frame+32],%o4
771 or %o7,%o0,%o0 ! 64-bit result
772 ldx [%sp+$bias+$frame+40],%o5
774 ldx [$tp+8],%o7 ! tp[j]
775 srlx %o3,16,%g1 ! 34-bit carry
783 stx %o0,[$tp] ! tp[j-1]
797 stx %o4,[$tp] ! tp[num-1]
806 add $tp,8,$tp ! adjust tp to point at the end
808 sub %g0,$num,%o7 ! n=-num
810 subcc %g0,%g0,%g0 ! clear %icc.c
827 sub %g0,$num,%o7 ! n=-num
848 sub %g0,$num,%o7 ! n=-num
859 ldx [%sp+$bias+$frame+48],%o7
860 wr %g0,%o7,%asi ! restore %asi
866 .type $fname,#function
867 .size $fname,(.-$fname)
868 .asciz "Montgomery Multipltication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
872 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
874 # Below substitution makes it possible to compile without demanding
875 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
876 # dare to do this, because VIS capability is detected at run-time now
877 # and this routine is not called on a CPU not capable of executing it. Do
878 # note that fzeros is not the only VIS dependency! Another dependency
879 # is implicit and is just _a_ numerical value loaded to %asi register,
880 # which assembler can't recognize as VIS specific...
881 $code =~ s/fzeros\s+%f([0-9]+)/
882 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)