2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20 # Because unlike integer multiplier, which simply stalls whole CPU,
21 # FPU is fully pipelined and can effectively emit 48 bit partial
22 # product every cycle. Why not blended SPARC v9? One can argue that
23 # making this module dependent on UltraSPARC VIS extension limits its
24 # binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25 # implementations from compatibility matrix. But the rest, whole Sun
26 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27 # VIS extension instructions used in this module. This is considered
28 # good enough to not care about HAL SPARC64 users [if any] who have
29 # integer-only pure SPARCv9 module to "fall down" to.
31 # USI&II cores currently exhibit uniform 2x improvement [over pre-
32 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves by a few percent for shorter keys and worsens by a
# few percent for longer keys. This is because the USIII integer multiplier
35 # is >3x faster than USI&II one, which is harder to match [but see
36 # TODO list below]. It should also be noted that SPARC64 V features
37 # out-of-order execution, which *might* mean that integer multiplier
38 # is pipelined, which in turn *might* be impossible to match... On
39 # additional note, SPARC64 V implements FP Multiply-Add instruction,
40 # which is perfectly usable in this context... In other words, as far
41 # as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies the following "non-natural" limitations on input arguments:
45 # - num may not be less than 4;
46 # - num has to be even;
47 # Failure to meet either condition has no fatal effects, simply
48 # doesn't give any performance gain.
51 # - modulo-schedule inner loop for better performance (on in-order
52 # execution core such as UltraSPARC this shall result in further
53 # noticeable(!) improvement);
54 # - dedicated squaring procedure[?];
56 ######################################################################
# Modulo-scheduled inner loops make it possible to interleave floating point and
60 # integer instructions and minimize Read-After-Write penalties. This
# results in a *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% on USIII.
# $output is the last command-line argument if it looks like a file
# name (i.e. it has an extension); otherwise assembly goes to STDOUT.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Redirect STDOUT to the output file. Three-argument open so that a
# filename beginning with ">" or "|" cannot alter the open mode, and
# die on failure instead of silently writing to the terminal.
if (defined($output)) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# Name of the generated assembly entry point.
$fname="bn_mul_mont_fpu";
# In order to provide for 32-/64-bit ABI duality, integers wider than
# 32 bits are kept in %g1-%g4 and %o0-%o5, while %l0-%l7 and %i0-%i5
# are used exclusively for pointers, indexes and other small values...

# Incoming arguments, in SPARC register order:
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                 const BN_ULONG *np, const BN_ULONG *n0, int num);
($rp, $ap, $bp, $np, $n0, $num) = map { "%i$_" } 0 .. 5;

# a[num] and n[num] are smashed into 32-bit words and saved to these
# four vectors as double-precision FP values. This way a bunch of
# fxtods are eliminated in the second loop and L1-cache aliasing is
# minimized...
($ap_l, $ap_h, $np_l, $np_h) = map { "%l$_" } 1 .. 4;

$mask = "%l7";                  # 16-bit mask, 0xffff

$n0    = "%g4";                 # reassigned(!) to "64-bit" register
$carry = "%i4";                 # %i4 reused(!) for a carry bit

# FP register naming chart: $ba-$bd hold b[i] and $na-$nd hold the
# n0-product, each split into 4x16-bit values (see transfer comments
# in the assembly below).
($ba, $bb, $bc, $bd, $na, $nb, $nc, $nd) =
        map { "%f" . (2 * $_) } 0 .. 7;

# Smashed a[j]/n[j] low/high halves; the "_" names are the odd
# single-precision halves of the even-numbered double registers.
($alo, $alo_, $ahi, $ahi_) = map { "%f$_" } 16 .. 19;
($nlo, $nlo_, $nhi, $nhi_) = map { "%f$_" } 20 .. 23;

# Carried-over partial dot products.
($dota, $dotb) = ("%f24", "%f26");

# Partial-product accumulators (upper FP register bank).
($aloa, $alob, $aloc, $alod) = map { "%f" . (32 + 2 * $_) } 0 .. 3;
($ahia, $ahib, $ahic, $ahid) = map { "%f" . (40 + 2 * $_) } 0 .. 3;
($nloa, $nlob, $nloc, $nlod) = map { "%f" . (48 + 2 * $_) } 0 .. 3;
($nhia, $nhib, $nhic, $nhid) = map { "%f" . (56 + 2 * $_) } 0 .. 3;

$ASI_FL16_P = 0xD2;             # magic ASI value to engage 16-bit FP load
129 #include "sparc_arch.h"
131 .section ".text",#alloc,#execinstr
136 save %sp,-$frame-$locals,%sp
141 andcc $num,1,%g0 ! $num has to be even...
143 clr %i0 ! signal "unsupported input value"
146 sethi %hi(0xffff),$mask
147 ld [%i4+0],$n0 ! $n0 reassigned, remember?
148 or $mask,%lo(0xffff),$mask
151 or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
153 sll $num,3,$num ! num*=8
155 add %sp,$bias,%o0 ! real top of stack
157 add %o1,$num,%o1 ! %o1=num*5
159 and %o0,-2048,%o0 ! optimize TLB utilization
160 sub %o0,$bias,%sp ! alloca(5*num*8)
162 rd %asi,%o7 ! save %asi
163 add %sp,$bias+$frame+$locals,$tp
165 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
170 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
172 add $rp,$num,$rp ! readjust input pointers to point
173 add $ap,$num,$ap ! at the ends too...
177 stx %o7,[%sp+$bias+$frame+48] ! save %asi
179 sub %g0,$num,$i ! i=-num
180 sub %g0,$num,$j ! j=-num
185 ld [%o3+4],%g1 ! bp[0]
187 ld [%o4+4],%g5 ! ap[0]
196 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
197 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
198 stx %o0,[%sp+$bias+$frame+0]
200 ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
204 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
209 ! transfer b[i] to FPU as 4x16-bit values
219 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
220 ldda [%sp+$bias+$frame+6]%asi,$na
222 ldda [%sp+$bias+$frame+4]%asi,$nb
224 ldda [%sp+$bias+$frame+2]%asi,$nc
226 ldda [%sp+$bias+$frame+0]%asi,$nd
229 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
233 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
243 faddd $aloa,$nloa,$nloa
246 faddd $alob,$nlob,$nlob
249 faddd $aloc,$nloc,$nloc
252 faddd $alod,$nlod,$nlod
255 faddd $ahia,$nhia,$nhia
258 faddd $ahib,$nhib,$nhib
261 faddd $ahic,$nhic,$dota ! $nhic
262 faddd $ahid,$nhid,$dotb ! $nhid
264 faddd $nloc,$nhia,$nloc
265 faddd $nlod,$nhib,$nlod
272 std $nloa,[%sp+$bias+$frame+0]
274 std $nlob,[%sp+$bias+$frame+8]
276 std $nloc,[%sp+$bias+$frame+16]
278 std $nlod,[%sp+$bias+$frame+24]
280 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
284 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
294 ldx [%sp+$bias+$frame+0],%o0
296 ldx [%sp+$bias+$frame+8],%o1
298 ldx [%sp+$bias+$frame+16],%o2
300 ldx [%sp+$bias+$frame+24],%o3
304 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
308 faddd $aloa,$nloa,$nloa
311 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
315 faddd $alob,$nlob,$nlob
319 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
320 faddd $aloc,$nloc,$nloc
330 !or %o7,%o0,%o0 ! 64-bit result
331 srlx %o3,16,%g1 ! 34-bit carry
334 faddd $alod,$nlod,$nlod
337 faddd $ahia,$nhia,$nhia
340 faddd $ahib,$nhib,$nhib
343 faddd $dota,$nloa,$nloa
344 faddd $dotb,$nlob,$nlob
345 faddd $ahic,$nhic,$dota ! $nhic
346 faddd $ahid,$nhid,$dotb ! $nhid
348 faddd $nloc,$nhia,$nloc
349 faddd $nlod,$nhib,$nlod
356 std $nloa,[%sp+$bias+$frame+0]
357 std $nlob,[%sp+$bias+$frame+8]
359 std $nloc,[%sp+$bias+$frame+16]
361 std $nlod,[%sp+$bias+$frame+24]
363 .align 32 ! incidentally already aligned !
367 ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
371 ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
381 ldx [%sp+$bias+$frame+0],%o0
383 ldx [%sp+$bias+$frame+8],%o1
385 ldx [%sp+$bias+$frame+16],%o2
387 ldx [%sp+$bias+$frame+24],%o3
391 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
395 faddd $aloa,$nloa,$nloa
398 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
402 faddd $alob,$nlob,$nlob
406 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
408 faddd $aloc,$nloc,$nloc
414 faddd $alod,$nlod,$nlod
420 faddd $ahia,$nhia,$nhia
424 or %o7,%o0,%o0 ! 64-bit result
425 faddd $ahib,$nhib,$nhib
428 faddd $dota,$nloa,$nloa
429 srlx %o3,16,%g1 ! 34-bit carry
430 faddd $dotb,$nlob,$nlob
434 stx %o0,[$tp] ! tp[j-1]=
436 faddd $ahic,$nhic,$dota ! $nhic
437 faddd $ahid,$nhid,$dotb ! $nhid
439 faddd $nloc,$nhia,$nloc
440 faddd $nlod,$nhib,$nlod
447 std $nloa,[%sp+$bias+$frame+0]
448 std $nlob,[%sp+$bias+$frame+8]
449 std $nloc,[%sp+$bias+$frame+16]
450 std $nlod,[%sp+$bias+$frame+24]
460 ldx [%sp+$bias+$frame+0],%o0
461 ldx [%sp+$bias+$frame+8],%o1
462 ldx [%sp+$bias+$frame+16],%o2
463 ldx [%sp+$bias+$frame+24],%o3
466 std $dota,[%sp+$bias+$frame+32]
468 std $dotb,[%sp+$bias+$frame+40]
472 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
481 or %o7,%o0,%o0 ! 64-bit result
482 ldx [%sp+$bias+$frame+32],%o4
484 ldx [%sp+$bias+$frame+40],%o5
485 srlx %o3,16,%g1 ! 34-bit carry
489 stx %o0,[$tp] ! tp[j-1]=
503 stx %o4,[$tp] ! tp[num-1]=
509 sub %g0,$num,$j ! j=-num
510 add %sp,$bias+$frame+$locals,$tp
515 ld [%o3+4],%g1 ! bp[i]
517 ld [%o4+4],%g5 ! ap[0]
524 ldx [$tp],%o2 ! tp[0]
527 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
528 stx %o0,[%sp+$bias+$frame+0]
530 ! transfer b[i] to FPU as 4x16-bit values
536 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
537 ldda [%sp+$bias+$frame+6]%asi,$na
539 ldda [%sp+$bias+$frame+4]%asi,$nb
541 ldda [%sp+$bias+$frame+2]%asi,$nc
543 ldda [%sp+$bias+$frame+0]%asi,$nd
545 ldd [$ap_l+$j],$alo ! load a[j] in double format
549 ldd [$np_l+$j],$nlo ! load n[j] in double format
559 faddd $aloa,$nloa,$nloa
562 faddd $alob,$nlob,$nlob
565 faddd $aloc,$nloc,$nloc
568 faddd $alod,$nlod,$nlod
571 faddd $ahia,$nhia,$nhia
574 faddd $ahib,$nhib,$nhib
577 faddd $ahic,$nhic,$dota ! $nhic
578 faddd $ahid,$nhid,$dotb ! $nhid
580 faddd $nloc,$nhia,$nloc
581 faddd $nlod,$nhib,$nlod
588 std $nloa,[%sp+$bias+$frame+0]
589 std $nlob,[%sp+$bias+$frame+8]
590 std $nloc,[%sp+$bias+$frame+16]
592 std $nlod,[%sp+$bias+$frame+24]
594 ldd [$ap_l+$j],$alo ! load a[j] in double format
596 ldd [$np_l+$j],$nlo ! load n[j] in double format
604 ldx [%sp+$bias+$frame+0],%o0
605 faddd $aloa,$nloa,$nloa
607 ldx [%sp+$bias+$frame+8],%o1
609 ldx [%sp+$bias+$frame+16],%o2
610 faddd $alob,$nlob,$nlob
612 ldx [%sp+$bias+$frame+24],%o3
616 faddd $aloc,$nloc,$nloc
621 faddd $alod,$nlod,$nlod
626 faddd $ahia,$nhia,$nhia
628 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
634 faddd $ahib,$nhib,$nhib
637 faddd $dota,$nloa,$nloa
639 faddd $dotb,$nlob,$nlob
642 faddd $ahic,$nhic,$dota ! $nhic
644 faddd $ahid,$nhid,$dotb ! $nhid
645 or %o7,%o0,%o0 ! 64-bit result
647 faddd $nloc,$nhia,$nloc
650 faddd $nlod,$nhib,$nlod
651 srlx %o3,16,%g1 ! 34-bit carry
660 std $nloa,[%sp+$bias+$frame+0]
661 std $nlob,[%sp+$bias+$frame+8]
663 std $nloc,[%sp+$bias+$frame+16]
664 bz,pn %icc,.Linnerskip
665 std $nlod,[%sp+$bias+$frame+24]
671 ldd [$ap_l+$j],$alo ! load a[j] in double format
673 ldd [$np_l+$j],$nlo ! load n[j] in double format
681 ldx [%sp+$bias+$frame+0],%o0
682 faddd $aloa,$nloa,$nloa
684 ldx [%sp+$bias+$frame+8],%o1
686 ldx [%sp+$bias+$frame+16],%o2
687 faddd $alob,$nlob,$nlob
689 ldx [%sp+$bias+$frame+24],%o3
693 faddd $aloc,$nloc,$nloc
698 faddd $alod,$nlod,$nlod
703 faddd $ahia,$nhia,$nhia
705 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
710 faddd $ahib,$nhib,$nhib
713 faddd $dota,$nloa,$nloa
715 faddd $dotb,$nlob,$nlob
718 faddd $ahic,$nhic,$dota ! $nhic
720 faddd $ahid,$nhid,$dotb ! $nhid
721 or %o7,%o0,%o0 ! 64-bit result
722 faddd $nloc,$nhia,$nloc
724 ldx [$tp+8],%o7 ! tp[j]
725 faddd $nlod,$nhib,$nlod
726 srlx %o3,16,%g1 ! 34-bit carry
736 stx %o0,[$tp] ! tp[j-1]
739 std $nloa,[%sp+$bias+$frame+0]
740 std $nlob,[%sp+$bias+$frame+8]
741 std $nloc,[%sp+$bias+$frame+16]
743 std $nlod,[%sp+$bias+$frame+24]
751 ldx [%sp+$bias+$frame+0],%o0
752 ldx [%sp+$bias+$frame+8],%o1
753 ldx [%sp+$bias+$frame+16],%o2
754 ldx [%sp+$bias+$frame+24],%o3
757 std $dota,[%sp+$bias+$frame+32]
759 std $dotb,[%sp+$bias+$frame+40]
763 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
772 ldx [%sp+$bias+$frame+32],%o4
773 or %o7,%o0,%o0 ! 64-bit result
774 ldx [%sp+$bias+$frame+40],%o5
776 ldx [$tp+8],%o7 ! tp[j]
777 srlx %o3,16,%g1 ! 34-bit carry
785 stx %o0,[$tp] ! tp[j-1]
799 stx %o4,[$tp] ! tp[num-1]
808 add $tp,8,$tp ! adjust tp to point at the end
810 sub %g0,$num,%o7 ! n=-num
812 subcc %g0,%g0,%g0 ! clear %icc.c
829 sub %g0,$num,%o7 ! n=-num
850 sub %g0,$num,%o7 ! n=-num
861 ldx [%sp+$bias+$frame+48],%o7
862 wr %g0,%o7,%asi ! restore %asi
868 .type $fname,#function
869 .size $fname,(.-$fname)
870 .asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
# Expand `...` constructs in the generated code by evaluating their
# contents as Perl expressions (compile-time arithmetic on offsets).
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;
876 # Below substitution makes it possible to compile without demanding
877 # VIS extensions on command line, e.g. -xarch=v9 vs. -xarch=v9a. I
878 # dare to do this, because VIS capability is detected at run-time now
879 # and this routine is not called on CPU not capable to execute it. Do
880 # note that fzeros is not the only VIS dependency! Another dependency
881 # is implicit and is just _a_ numerical value loaded to %asi register,
882 # which assembler can't recognize as VIS specific...
883 $code =~ s/fzeros\s+%f([0-9]+)/
884 sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)