3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
9 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
10 # Because unlike the integer multiplier, which simply stalls the whole
11 # CPU, the FPU is fully pipelined and can effectively emit a 48-bit
12 # partial product every cycle. Why not blended SPARC v9? One can argue
13 # that making this module dependent on the UltraSPARC VIS extension
14 # limits its binary compatibility. That may well be, but the simple
15 # fact is that there is no known SPARC v9 implementation that does not
16 # implement VIS. Even Fujitsu's brand new SPARC64 V has a VIS unit.
18 # USI&II cores currently exhibit a uniform 2x improvement [over the
19 # pre-bn_mul_mont codebase] for all key lengths and benchmarks. On
20 # USIII performance improves a few percent for shorter keys and worsens
21 # a few percent for longer keys. This is because the USIII integer
22 # multiplier is >3x faster than the USI&II one, which is harder to match
23 # [but see TODO list below]. It should also be noted that SPARC64 V
24 # features out-of-order execution, which *might* mean that its integer
25 # multiplier is pipelined, which in turn *might* be impossible to match...
28 # - complete 32-bit adaptation (requires universal changes to
29 # BN_MONT_CTX and bn_mul_mont prototype, but nothing really
31 # - modulo-schedule inner loop for better performance (on in-order
32 # execution core such as UltraSPARC this shall result in further
33 # noticeable(!) improvement);
34 # - dedicated squaring procedure[?];
39 $bits=64 if (/\-m64/ || /\-xarch\=v9/);
40 $vis=1 if (/\-mcpu=ultra/ || /\-xarch\=v(?:9|8plus)\S/); # VIS only when -xarch is v9 or v8plus followed by a suffix character (e.g. v9a, v8plusa); bare v9/v8plus do not match
43 if (!$vis || $bits==32) { # 32-bit is not supported just yet...
45 .section ".text",#alloc,#execinstr
49 xor %o0,%o0,%o0 ! just signal "not implemented"
50 .type $fname,#function
51 .size $fname,(.-$fname)
61 $frame=128; # 96 rounded up to largest known cache-line
65 # In order to provide for 32-/64-bit ABI duality, I keep integers wider
66 # than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
67 # exclusively for pointers, indexes and other small values...
69 $rp="%i0"; # BN_ULONG *rp,
70 $ap="%i1"; # const BN_ULONG *ap,
71 $bp="%i2"; # const BN_ULONG *bp,
72 $np="%i3"; # const BN_ULONG *np,
73 $n0="%i4"; # const BN_ULONG *n0,
74 $num="%i5"; # int num);
77 $ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
78 $ap_h="%l2"; # to these four vectors as double-precision FP values.
79 $np_l="%l3"; # This way a bunch of fxtods are eliminated in second
80 $np_h="%l4"; # loop and L1-cache aliasing is minimized...
83 $mask="%l7"; # 16-bit mask, 0xffff
85 $n0="%g4"; # reassigned!!! the assembly loads the value at [%i4] into this register
86 $carry="%i4"; # reassigned!!! [only 1 bit is used]; reuses %i4, the register the n0 pointer arrived in
88 # FP register naming chart
103 $ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6"; # presumably the 4x16-bit pieces of b[i] moved to the FPU -- TODO confirm
104 $na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14"; # presumably the 4x16-bit pieces of the ap[0]*b[i]*n0 value -- TODO confirm
105 $alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19"; # a[j] words: $alo_ is the target of a 32-bit ld, $alo is what gets saved/reloaded as a double; $ahi/$ahi_ presumably likewise
106 $nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23"; # n[j] words, same scheme as the a[j] registers above
108 $dota="%f24"; $dotb="%f26"; # receive $ahic+$nhic and $ahid+$nhid; added into $nloa/$nlob on the following j pass
110 $aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38"; # a-side terms (low word), added to the matching $nlo? register by faddd
111 $ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46"; # a-side terms (high word), added to the matching $nhi? register by faddd
112 $nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54"; # n-side terms (low word); also the faddd destinations that are stored to the stack frame
113 $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; # n-side terms (high word); $nhia/$nhib are folded into $nloc/$nlod
115 $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
118 .ident "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
119 .section ".text",#alloc,#execinstr
125 sethi %hi(0xffff),$mask
126 sll $num,3,$num ! num*=8
127 or $mask,%lo(0xffff),$mask
128 ldx [%i4],$n0 ! reassigned, remember?
130 add %sp,$bias,%o0 ! real top of stack
132 add %o1,$num,%o1 ! %o1=num*5
135 and %o0,-2048,%o0 ! optimize TLB utilization
136 sub %o0,$bias,%sp ! alloca
139 add %sp,$bias+$frame+$locals,$tp
141 add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vector ends !
146 wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
148 add $rp,$num,$rp ! readjust input pointers to point
149 add $ap,$num,$ap ! at the ends too...
153 stx %o7,[%sp+$bias+$frame+48]
160 ldx [$bp+$i],%o0 ! bp[0]
162 add %sp,$bias+$frame+0,%o7
163 ldx [$ap+$j],%o1 ! ap[0]
165 mulx %o1,%o0,%o0 ! ap[0]*bp[0]
166 mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
169 ld [%o3+4],$alo_ ! load a[j] as pair of 32-bit words
173 ld [%o5+4],$nlo_ ! load n[j] as pair of 32-bit words
178 ! transfer b[i] to FPU as 4x16-bit values
188 ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
198 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
202 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
213 faddd $aloa,$nloa,$nloa
216 faddd $alob,$nlob,$nlob
219 faddd $aloc,$nloc,$nloc
222 faddd $alod,$nlod,$nlod
225 faddd $ahia,$nhia,$nhia
229 faddd $ahib,$nhib,$nhib
230 faddd $ahic,$nhic,$dota ! $nhic
231 faddd $ahid,$nhid,$dotb ! $nhid
233 faddd $nloc,$nhia,$nloc
234 faddd $nlod,$nhib,$nlod
241 std $nloa,[%sp+$bias+$frame+0]
242 std $nlob,[%sp+$bias+$frame+8]
243 std $nloc,[%sp+$bias+$frame+16]
244 std $nlod,[%sp+$bias+$frame+24]
245 ldx [%sp+$bias+$frame+0],%o0
246 ldx [%sp+$bias+$frame+8],%o1
247 ldx [%sp+$bias+$frame+16],%o2
248 ldx [%sp+$bias+$frame+24],%o3
255 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
264 !or %o7,%o0,%o0 ! 64-bit result
265 srlx %o3,16,%g1 ! 34-bit carry
273 ld [%o3+4],$alo_ ! load a[j] as pair of 32-bit words
277 ld [%o4+4],$nlo_ ! load n[j] as pair of 32-bit words
287 std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
291 std $nlo,[$np_l+$j] ! save smashed np[j] in double format
297 faddd $aloa,$nloa,$nloa
300 faddd $alob,$nlob,$nlob
303 faddd $aloc,$nloc,$nloc
306 faddd $alod,$nlod,$nlod
309 faddd $ahia,$nhia,$nhia
312 faddd $ahib,$nhib,$nhib
314 faddd $dota,$nloa,$nloa
315 faddd $dotb,$nlob,$nlob
316 faddd $ahic,$nhic,$dota ! $nhic
317 faddd $ahid,$nhid,$dotb ! $nhid
319 faddd $nloc,$nhia,$nloc
320 faddd $nlod,$nhib,$nlod
327 std $nloa,[%sp+$bias+$frame+0]
328 std $nlob,[%sp+$bias+$frame+8]
329 std $nloc,[%sp+$bias+$frame+16]
330 std $nlod,[%sp+$bias+$frame+24]
331 ldx [%sp+$bias+$frame+0],%o0
332 ldx [%sp+$bias+$frame+8],%o1
333 ldx [%sp+$bias+$frame+16],%o2
334 ldx [%sp+$bias+$frame+24],%o3
341 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
350 or %o7,%o0,%o0 ! 64-bit result
352 srlx %o3,16,%g1 ! 34-bit carry
356 stx %o0,[$tp] ! tp[j-1]=
363 std $dota,[%sp+$bias+$frame+32]
364 std $dotb,[%sp+$bias+$frame+40]
365 ldx [%sp+$bias+$frame+32],%o0
366 ldx [%sp+$bias+$frame+40],%o1
379 stx %o0,[$tp] ! tp[num-1]=
386 add %sp,$bias+$frame+$locals,$tp
389 ldx [$bp+$i],%o0 ! bp[i]
390 add %sp,$bias+$frame+0,%o7
391 ldx [$ap+$j],%o1 ! ap[0]
393 ldx [$tp],%o2 ! tp[0]
396 mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
400 ! transfer b[i] to FPU as 4x16-bit values
406 ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
415 ldd [$ap_l+$j],$alo ! load a[j] in double format
419 ldd [$np_l+$j],$nlo ! load n[j] in double format
430 faddd $aloa,$nloa,$nloa
433 faddd $alob,$nlob,$nlob
436 faddd $aloc,$nloc,$nloc
439 faddd $alod,$nlod,$nlod
442 faddd $ahia,$nhia,$nhia
446 faddd $ahib,$nhib,$nhib
447 faddd $ahic,$nhic,$dota ! $nhic
448 faddd $ahid,$nhid,$dotb ! $nhid
450 faddd $nloc,$nhia,$nloc
451 faddd $nlod,$nhib,$nlod
458 std $nloa,[%sp+$bias+$frame+0]
459 std $nlob,[%sp+$bias+$frame+8]
460 std $nloc,[%sp+$bias+$frame+16]
461 std $nlod,[%sp+$bias+$frame+24]
462 ldx [%sp+$bias+$frame+0],%o0
463 ldx [%sp+$bias+$frame+8],%o1
464 ldx [%sp+$bias+$frame+16],%o2
465 ldx [%sp+$bias+$frame+24],%o3
472 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
482 or %o7,%o0,%o0 ! 64-bit result
486 srlx %o3,16,%g1 ! 34-bit carry
494 ldd [$ap_l+$j],$alo ! load a[j] in double format
496 ldd [$np_l+$j],$nlo ! load n[j] in double format
505 faddd $aloa,$nloa,$nloa
508 faddd $alob,$nlob,$nlob
511 faddd $aloc,$nloc,$nloc
514 faddd $alod,$nlod,$nlod
517 faddd $ahia,$nhia,$nhia
521 faddd $ahib,$nhib,$nhib
522 faddd $dota,$nloa,$nloa
523 faddd $dotb,$nlob,$nlob
524 faddd $ahic,$nhic,$dota ! $nhic
525 faddd $ahid,$nhid,$dotb ! $nhid
527 faddd $nloc,$nhia,$nloc
528 faddd $nlod,$nhib,$nlod
535 std $nloa,[%sp+$bias+$frame+0]
536 std $nlob,[%sp+$bias+$frame+8]
537 std $nloc,[%sp+$bias+$frame+16]
538 std $nlod,[%sp+$bias+$frame+24]
539 ldx [%sp+$bias+$frame+0],%o0
540 ldx [%sp+$bias+$frame+8],%o1
541 ldx [%sp+$bias+$frame+16],%o2
542 ldx [%sp+$bias+$frame+24],%o3
549 add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
558 or %o7,%o0,%o0 ! 64-bit result
560 srlx %o3,16,%g1 ! 34-bit carry
564 ldx [$tp+8],%o7 ! tp[j]
569 stx %o0,[$tp] ! tp[j-1]
576 std $dota,[%sp+$bias+$frame+32]
577 std $dotb,[%sp+$bias+$frame+40]
578 ldx [%sp+$bias+$frame+32],%o0
579 ldx [%sp+$bias+$frame+40],%o1
592 stx %o0,[$tp] ! tp[num-1]
601 sub %g0,$num,$j ! j=-num
602 add $tp,8,$tp ! adjust tp to point at the end
604 cmp $carry,0 ! clears %icc.c
611 bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
624 subccc $carry,0,$carry
649 ldx [%sp+$bias+$frame+48],%o7
650 wr %g0,%o7,%asi ! restore %asi
655 .type $fname,#function
656 .size $fname,(.-$fname)
659 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every `...` span in $code with the result of eval()ing the text between the backticks