2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
# The reason for undertaking this effort is basically the following. Even though
20 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
21 # performance was observed to be less than impressive, essentially as
22 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
23 # Well, it's not surprising that IBM had to make some sacrifices to
24 # boost the clock frequency that much, but no overall improvement?
25 # Having observed how much difference did switching to FPU make on
26 # UltraSPARC, playing same stunt on Power 6 appeared appropriate...
27 # Unfortunately the resulting performance improvement is not as
28 # impressive, ~30%, and in absolute terms is still very far from what
29 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
30 # something wrong, but in the lack of assembler level micro-profiling
31 # data or at least decent platform guide I can't tell... Or better
32 # results might be achieved with VMX... Anyway, this module provides
33 # *worse* performance on other PowerPC implementations, ~40-15% slower
34 # on PPC970 depending on key length and ~40% slower on Power 5 for all
35 # key lengths. As it's obviously inappropriate as "best all-round"
36 # alternative, it has to be complemented with run-time CPU family
37 # detection. Oh! It should also be noted that unlike other PowerPC
# implementation IALU ppc-mont.pl module performs *suboptimally* on
39 # >=1024-bit key lengths on Power 6. It should also be noted that
40 # *everything* said so far applies to 64-bit builds! As far as 32-bit
41 # application executed on 64-bit CPU goes, this module is likely to
42 # become preferred choice, because it's easy to adapt it for such
43 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
47 # Micro-profiling assisted optimization results in ~15% improvement
48 # over original ppc64-mont.pl version, or overall ~50% improvement
49 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
50 # Power 6 CPU, this module is 5-150% faster depending on key length,
51 # [hereafter] more for longer keys. But if compared to ppc-mont.pl
52 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
53 # in absolute terms, but it's apparently the way Power 6 is...
57 # Adapted for 32-bit build this module delivers 25-120%, yes, more
58 # than *twice* for longer keys, performance improvement over 32-bit
59 # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
60 # even 64-bit integer operations and the trouble is that most PPC
61 # operating systems don't preserve upper halves of general purpose
62 # registers upon 32-bit signal delivery. They do preserve them upon
63 # context switch, but not signalling:-( This means that asynchronous
64 # signals have to be blocked upon entry to this subroutine. Signal
65 # masking (and of course complementary unmasking) has quite an impact
66 # on performance, naturally larger for shorter keys. It's so severe
67 # that 512-bit key performance can be as low as 1/3 of expected one.
68 # This is why this routine can be engaged for longer key operations
69 # only on these OSes, see crypto/ppccap.c for further details. MacOS X
70 # is an exception from this and doesn't require signal masking, and
71 # that's where above improvement coefficients were collected. For
72 # others alternative would be to break dependence on upper halves of
73 # GPRs by sticking to 32-bit integer operations...
77 # Remove above mentioned dependence on GPRs' upper halves in 32-bit
78 # build. No signal masking overhead, but integer instructions are
79 # *more* numerous... It's still "universally" faster than 32-bit
80 # ppc-mont.pl, but improvement coefficient is not as impressive
# Flavour-dependent configuration.  The flavour string must mention
# either "32" or "64"; anything else is rejected outright.  When both
# appear, the 32-bit settings win, because "32" is tested first.
die "nonsense $flavour" unless ($flavour =~ /32/ or $flavour =~ /64/);

# Name of the generated routine; it is the same for both word sizes.
$fname = "bn_mul_mont_fpu64";

# "Store indexed and update" mnemonic: word form for 32-bit flavours,
# doubleword form (same operation, 64-bit mnemonic) otherwise.
$STUX = ($flavour =~ /32/) ? "stwux" : "stdux";

# 4 for flavours ending in "le", 0 otherwise.  The value is XORed into
# the offsets of 32-bit loads that pick one half of a 64-bit slot.
$LITTLE_ENDIAN = ($flavour =~ /le$/) ? 4 : 0;
# Work out the directory this script lives in so that the translator
# can be found relative to it.  Fall back to an empty prefix when $0
# carries no directory component, rather than reusing a stale $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for ppc-xlate.pl next to this script first, then in the shared
# perlasm directory two levels up.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Send everything printed to STDOUT through the translator.  The next
# command-line argument is appended to the translator's command line.
# Low-precedence "or" is required here: with "||" the die() attached to
# the (always true) command string and a failed open() went unnoticed.
open(STDOUT, "| $^X $xlate $flavour " . shift)
    or die "can't call $xlate: $!";
# Size of the padded frame header, in bytes.
$FRAME = 64;

# General-purpose registers.
$rp    = "r9";      # $rp is reassigned (copied from r3 on entry)
# non-volatile registers
$nap_d = "r22";     # interleaved ap and np in double format
$t0    = "r24";     # temporary registers

# PPC offers enough register bank capacity to unroll inner loops twice.
# Floating-point register assignment, grouped by role:
($ba,  $bb,  $bc,  $bd)  = map { "f$_" } 0 .. 3;
($na,  $nb,  $nc,  $nd)  = map { "f$_" } 4 .. 7;
($dota, $dotb)           = map { "f$_" } 8, 9;
($A0,  $A1,  $A2,  $A3)  = map { "f$_" } 10 .. 13;
($N0,  $N1,  $N2,  $N3)  = map { "f$_" } 20 .. 23;
($T0a, $T0b,
 $T1a, $T1b,
 $T2a, $T2b,
 $T3a, $T3b)             = map { "f$_" } 24 .. 31;
178 # sp----------->+-------------------------------+
180 # +-------------------------------+
182 # +64 +-------------------------------+
183 # | 16 gpr<->fpr transfer zone |
186 # +16*8 +-------------------------------+
187 # | __int64 tmp[-1] |
188 # +-------------------------------+
189 # | __int64 tmp[num] |
193 # +(num+1)*8 +-------------------------------+
194 # | padding to 64 byte boundary |
196 # +X +-------------------------------+
197 # | double nap_d[4*num] |
201 # +-------------------------------+
203 # -13*size_t +-------------------------------+
204 # | 13 saved gpr, r19-r31 |
207 # -12*8 +-------------------------------+
208 # | 12 saved fpr, f20-f31 |
211 # +-------------------------------+
220 cmpwi $num,`3*8/$SIZE_T`
221 mr $rp,r3 ; $rp is reassigned
222 li r3,0 ; possible "not handled" return code
224 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
227 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
229 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
230 add $tp,$tp,$num ; place for tp[num+1]
231 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
232 subf $tp,$tp,$sp ; $sp-$tp
233 and $tp,$tp,$i ; minimize TLB usage
234 subf $tp,$sp,$tp ; $tp-$sp
236 $STUX $sp,$sp,$tp ; alloca
238 $PUSH r19,`-12*8-13*$SIZE_T`($i)
239 $PUSH r20,`-12*8-12*$SIZE_T`($i)
240 $PUSH r21,`-12*8-11*$SIZE_T`($i)
241 $PUSH r22,`-12*8-10*$SIZE_T`($i)
242 $PUSH r23,`-12*8-9*$SIZE_T`($i)
243 $PUSH r24,`-12*8-8*$SIZE_T`($i)
244 $PUSH r25,`-12*8-7*$SIZE_T`($i)
245 $PUSH r26,`-12*8-6*$SIZE_T`($i)
246 $PUSH r27,`-12*8-5*$SIZE_T`($i)
247 $PUSH r28,`-12*8-4*$SIZE_T`($i)
248 $PUSH r29,`-12*8-3*$SIZE_T`($i)
249 $PUSH r30,`-12*8-2*$SIZE_T`($i)
250 $PUSH r31,`-12*8-1*$SIZE_T`($i)
264 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
267 and $nap_d,$nap_d,$i ; align to 64 bytes
268 ; nap_d is off by 1, because it's used with stfdu/lfdu
269 addi $nap_d,$nap_d,-8
270 srwi $j,$num,`3+1` ; counter register, num/2
272 addi $tp,$sp,`$FRAME+$TRANSFER-8`
277 $code.=<<___ if ($SIZE_T==8);
278 ld $a0,0($ap) ; pull ap[0] value
279 ld $t3,0($bp) ; bp[0]
280 ld $n0,0($n0) ; pull n0[0] value
282 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
283 ; transfer bp[0] to FPU as 4x16-bit values
288 std $t0,`$FRAME+0`($sp)
289 std $t1,`$FRAME+8`($sp)
290 std $t2,`$FRAME+16`($sp)
291 std $t3,`$FRAME+24`($sp)
293 mulld $t7,$t7,$n0 ; tp[0]*n0
294 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
299 std $t4,`$FRAME+32`($sp)
300 std $t5,`$FRAME+40`($sp)
301 std $t6,`$FRAME+48`($sp)
302 std $t7,`$FRAME+56`($sp)
304 extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
305 extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
306 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
307 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
308 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
309 lwz $t5,`0^$LITTLE_ENDIAN`($np)
310 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
311 lwz $t7,`8^$LITTLE_ENDIAN`($np)
313 $code.=<<___ if ($SIZE_T==4);
314 lwz $a0,0($ap) ; pull ap[0,1] value
318 lwz $t1,0($bp) ; bp[0,1]
320 lwz $n0,0($n1) ; pull n0[0,1] value
323 mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
329 ; transfer bp[0] to FPU as 4x16-bit values
334 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
335 std $t1,`$FRAME+8`($sp)
336 std $t2,`$FRAME+16`($sp)
337 std $t3,`$FRAME+24`($sp)
339 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
345 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
350 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
351 std $t5,`$FRAME+40`($sp)
352 std $t6,`$FRAME+48`($sp)
353 std $t7,`$FRAME+56`($sp)
355 mr $t0,$a0 ; lwz $t0,0($ap)
356 mr $t1,$a1 ; lwz $t1,4($ap)
357 lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
359 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
365 lfd $ba,`$FRAME+0`($sp)
366 lfd $bb,`$FRAME+8`($sp)
367 lfd $bc,`$FRAME+16`($sp)
368 lfd $bd,`$FRAME+24`($sp)
369 lfd $na,`$FRAME+32`($sp)
370 lfd $nb,`$FRAME+40`($sp)
371 lfd $nc,`$FRAME+48`($sp)
372 lfd $nd,`$FRAME+56`($sp)
373 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
374 std $t1,`$FRAME+72`($sp)
375 std $t2,`$FRAME+80`($sp)
376 std $t3,`$FRAME+88`($sp)
377 std $t4,`$FRAME+96`($sp)
378 std $t5,`$FRAME+104`($sp)
379 std $t6,`$FRAME+112`($sp)
380 std $t7,`$FRAME+120`($sp)
390 lfd $A0,`$FRAME+64`($sp)
391 lfd $A1,`$FRAME+72`($sp)
392 lfd $A2,`$FRAME+80`($sp)
393 lfd $A3,`$FRAME+88`($sp)
394 lfd $N0,`$FRAME+96`($sp)
395 lfd $N1,`$FRAME+104`($sp)
396 lfd $N2,`$FRAME+112`($sp)
397 lfd $N3,`$FRAME+120`($sp)
411 stfd $A0,8($nap_d) ; save a[j] in double format
415 stfd $A2,24($nap_d) ; save a[j+1] in double format
419 stfd $N0,40($nap_d) ; save n[j] in double format
423 stfd $N2,56($nap_d) ; save n[j+1] in double format
426 fmadd $T1a,$A0,$bc,$T1a
427 fmadd $T1b,$A0,$bd,$T1b
428 fmadd $T2a,$A1,$bc,$T2a
429 fmadd $T2b,$A1,$bd,$T2b
430 fmadd $T3a,$A2,$bc,$T3a
431 fmadd $T3b,$A2,$bd,$T3b
435 fmadd $T1a,$N1,$na,$T1a
436 fmadd $T1b,$N1,$nb,$T1b
437 fmadd $T2a,$N2,$na,$T2a
438 fmadd $T2b,$N2,$nb,$T2b
439 fmadd $T3a,$N3,$na,$T3a
440 fmadd $T3b,$N3,$nb,$T3b
441 fmadd $T0a,$N0,$na,$T0a
442 fmadd $T0b,$N0,$nb,$T0b
444 fmadd $T1a,$N0,$nc,$T1a
445 fmadd $T1b,$N0,$nd,$T1b
446 fmadd $T2a,$N1,$nc,$T2a
447 fmadd $T2b,$N1,$nd,$T2b
448 fmadd $T3a,$N2,$nc,$T3a
449 fmadd $T3b,$N2,$nd,$T3b
450 fmadd $dota,$N3,$nc,$dota
451 fmadd $dotb,$N3,$nd,$dotb
462 stfd $T0a,`$FRAME+0`($sp)
463 stfd $T0b,`$FRAME+8`($sp)
464 stfd $T1a,`$FRAME+16`($sp)
465 stfd $T1b,`$FRAME+24`($sp)
466 stfd $T2a,`$FRAME+32`($sp)
467 stfd $T2b,`$FRAME+40`($sp)
468 stfd $T3a,`$FRAME+48`($sp)
469 stfd $T3b,`$FRAME+56`($sp)
474 $code.=<<___ if ($SIZE_T==8);
475 lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
476 lwz $t1,`0^$LITTLE_ENDIAN`($ap)
477 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
478 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
479 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
480 lwz $t5,`0^$LITTLE_ENDIAN`($np)
481 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
482 lwz $t7,`8^$LITTLE_ENDIAN`($np)
484 $code.=<<___ if ($SIZE_T==4);
485 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
489 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
495 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
496 std $t1,`$FRAME+72`($sp)
497 std $t2,`$FRAME+80`($sp)
498 std $t3,`$FRAME+88`($sp)
499 std $t4,`$FRAME+96`($sp)
500 std $t5,`$FRAME+104`($sp)
501 std $t6,`$FRAME+112`($sp)
502 std $t7,`$FRAME+120`($sp)
504 if ($SIZE_T==8 or $flavour =~ /osx/) {
506 ld $t0,`$FRAME+0`($sp)
507 ld $t1,`$FRAME+8`($sp)
508 ld $t2,`$FRAME+16`($sp)
509 ld $t3,`$FRAME+24`($sp)
510 ld $t4,`$FRAME+32`($sp)
511 ld $t5,`$FRAME+40`($sp)
512 ld $t6,`$FRAME+48`($sp)
513 ld $t7,`$FRAME+56`($sp)
517 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
518 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
519 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
520 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
521 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
522 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
523 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
524 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
528 lfd $A0,`$FRAME+64`($sp)
529 lfd $A1,`$FRAME+72`($sp)
530 lfd $A2,`$FRAME+80`($sp)
531 lfd $A3,`$FRAME+88`($sp)
532 lfd $N0,`$FRAME+96`($sp)
533 lfd $N1,`$FRAME+104`($sp)
534 lfd $N2,`$FRAME+112`($sp)
535 lfd $N3,`$FRAME+120`($sp)
551 stfd $A0,8($nap_d) ; save a[j] in double format
555 fmadd $T0a,$A0,$ba,$dota
556 fmadd $T0b,$A0,$bb,$dotb
557 stfd $A2,24($nap_d) ; save a[j+1] in double format
560 if ($SIZE_T==8 or $flavour =~ /osx/) {
562 fmadd $T1a,$A0,$bc,$T1a
563 fmadd $T1b,$A0,$bd,$T1b
564 fmadd $T2a,$A1,$bc,$T2a
565 fmadd $T2b,$A1,$bd,$T2b
566 stfd $N0,40($nap_d) ; save n[j] in double format
568 fmadd $T3a,$A2,$bc,$T3a
569 fmadd $T3b,$A2,$bd,$T3b
570 add $t0,$t0,$carry ; can not overflow
573 stfd $N2,56($nap_d) ; save n[j+1] in double format
579 fmadd $T1a,$N1,$na,$T1a
580 fmadd $T1b,$N1,$nb,$T1b
582 fmadd $T2a,$N2,$na,$T2a
583 fmadd $T2b,$N2,$nb,$T2b
585 fmadd $T3a,$N3,$na,$T3a
586 fmadd $T3b,$N3,$nb,$T3b
588 fmadd $T0a,$N0,$na,$T0a
589 fmadd $T0b,$N0,$nb,$T0b
594 fmadd $T1a,$N0,$nc,$T1a
595 fmadd $T1b,$N0,$nd,$T1b
596 insrdi $t0,$t3,16,0 ; 0..63 bits
597 fmadd $T2a,$N1,$nc,$T2a
598 fmadd $T2b,$N1,$nd,$T2b
600 fmadd $T3a,$N2,$nc,$T3a
601 fmadd $T3b,$N2,$nd,$T3b
603 fmadd $dota,$N3,$nc,$dota
604 fmadd $dotb,$N3,$nd,$dotb
621 insrdi $t4,$t7,16,0 ; 64..127 bits
622 srdi $carry,$t7,16 ; upper 33 bits
624 stfd $T0a,`$FRAME+0`($sp)
625 stfd $T0b,`$FRAME+8`($sp)
626 stfd $T1a,`$FRAME+16`($sp)
627 stfd $T1b,`$FRAME+24`($sp)
628 stfd $T2a,`$FRAME+32`($sp)
629 stfd $T2b,`$FRAME+40`($sp)
630 stfd $T3a,`$FRAME+48`($sp)
631 stfd $T3b,`$FRAME+56`($sp)
632 std $t0,8($tp) ; tp[j-1]
633 stdu $t4,16($tp) ; tp[j]
637 fmadd $T1a,$A0,$bc,$T1a
638 fmadd $T1b,$A0,$bd,$T1b
642 fmadd $T2a,$A1,$bc,$T2a
643 fmadd $T2b,$A1,$bd,$T2b
644 stfd $N0,40($nap_d) ; save n[j] in double format
647 insrwi $carry,$t1,16,0
648 fmadd $T3a,$A2,$bc,$T3a
649 fmadd $T3b,$A2,$bd,$T3b
655 stfd $N2,56($nap_d) ; save n[j+1] in double format
657 insrwi $t0,$t2,16,0 ; 0..31 bits
659 insrwi $carry,$t3,16,0
661 fmadd $T1a,$N1,$na,$T1a
662 fmadd $T1b,$N1,$nb,$T1b
663 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
664 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
668 fmadd $T2a,$N2,$na,$T2a
669 fmadd $T2b,$N2,$nb,$T2b
671 insrwi $carry,$t5,16,0
672 fmadd $T3a,$N3,$na,$T3a
673 fmadd $T3b,$N3,$nb,$T3b
677 fmadd $T0a,$N0,$na,$T0a
678 fmadd $T0b,$N0,$nb,$T0b
679 insrwi $t4,$t6,16,0 ; 32..63 bits
681 insrwi $carry,$t7,16,0
683 fmadd $T1a,$N0,$nc,$T1a
684 fmadd $T1b,$N0,$nd,$T1b
685 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
686 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
690 fmadd $T2a,$N1,$nc,$T2a
691 fmadd $T2b,$N1,$nd,$T2b
692 stw $t0,12($tp) ; tp[j-1]
695 insrwi $carry,$t3,16,0
696 fmadd $T3a,$N2,$nc,$T3a
697 fmadd $T3b,$N2,$nd,$T3b
698 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
699 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
703 fmadd $dota,$N3,$nc,$dota
704 fmadd $dotb,$N3,$nd,$dotb
705 insrwi $t2,$t6,16,0 ; 64..95 bits
707 insrwi $carry,$t7,16,0
711 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
712 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
719 insrwi $carry,$t1,16,0
727 insrwi $t0,$t4,16,0 ; 96..127 bits
729 insrwi $carry,$t5,16,0
731 stfd $T0a,`$FRAME+0`($sp)
732 stfd $T0b,`$FRAME+8`($sp)
733 stfd $T1a,`$FRAME+16`($sp)
734 stfd $T1b,`$FRAME+24`($sp)
735 stfd $T2a,`$FRAME+32`($sp)
736 stfd $T2b,`$FRAME+40`($sp)
737 stfd $T3a,`$FRAME+48`($sp)
738 stfd $T3b,`$FRAME+56`($sp)
739 stw $t2,20($tp) ; tp[j]
749 if ($SIZE_T==8 or $flavour =~ /osx/) {
751 ld $t0,`$FRAME+0`($sp)
752 ld $t1,`$FRAME+8`($sp)
753 ld $t2,`$FRAME+16`($sp)
754 ld $t3,`$FRAME+24`($sp)
755 ld $t4,`$FRAME+32`($sp)
756 ld $t5,`$FRAME+40`($sp)
757 ld $t6,`$FRAME+48`($sp)
758 ld $t7,`$FRAME+56`($sp)
759 stfd $dota,`$FRAME+64`($sp)
760 stfd $dotb,`$FRAME+72`($sp)
762 add $t0,$t0,$carry ; can not overflow
772 insrdi $t0,$t3,16,0 ; 0..63 bits
782 insrdi $t4,$t7,16,0 ; 64..127 bits
783 srdi $carry,$t7,16 ; upper 33 bits
784 ld $t6,`$FRAME+64`($sp)
785 ld $t7,`$FRAME+72`($sp)
787 std $t0,8($tp) ; tp[j-1]
788 stdu $t4,16($tp) ; tp[j]
790 add $t6,$t6,$carry ; can not overflow
795 std $t6,8($tp) ; tp[num-1]
799 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
800 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
801 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
802 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
803 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
804 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
805 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
806 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
807 stfd $dota,`$FRAME+64`($sp)
808 stfd $dotb,`$FRAME+72`($sp)
813 insrwi $carry,$t1,16,0
818 insrwi $t0,$t2,16,0 ; 0..31 bits
819 insrwi $carry,$t3,16,0
824 insrwi $carry,$t5,16,0
829 insrwi $t4,$t6,16,0 ; 32..63 bits
830 insrwi $carry,$t7,16,0
832 stw $t0,12($tp) ; tp[j-1]
835 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
836 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
837 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
838 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
839 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
840 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
841 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
842 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
847 insrwi $carry,$t3,16,0
852 insrwi $t2,$t6,16,0 ; 64..95 bits
853 insrwi $carry,$t7,16,0
858 insrwi $carry,$t1,16,0
863 insrwi $t0,$t4,16,0 ; 96..127 bits
864 insrwi $carry,$t5,16,0
866 stw $t2,20($tp) ; tp[j]
869 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
870 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
871 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
872 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
877 insrwi $carry,$t7,16,0
886 stw $t6,12($tp) ; tp[num-1]
892 subf $nap_d,$t7,$nap_d ; rewind pointer
897 addi $tp,$sp,`$FRAME+$TRANSFER`
901 $code.=<<___ if ($SIZE_T==8);
902 ldx $t3,$bp,$i ; bp[i]
904 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
905 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
906 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
907 ; transfer bp[i] to FPU as 4x16-bit values
912 std $t0,`$FRAME+0`($sp)
913 std $t1,`$FRAME+8`($sp)
914 std $t2,`$FRAME+16`($sp)
915 std $t3,`$FRAME+24`($sp)
917 mulld $t7,$t7,$n0 ; tp[0]*n0
918 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
923 std $t4,`$FRAME+32`($sp)
924 std $t5,`$FRAME+40`($sp)
925 std $t6,`$FRAME+48`($sp)
926 std $t7,`$FRAME+56`($sp)
928 $code.=<<___ if ($SIZE_T==4);
931 lwz $t1,0($t0) ; bp[i,i+1]
934 mullw $t4,$a0,$t1 ; ap[0]*bp[i]
935 lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
937 lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
942 addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
944 ; transfer bp[i] to FPU as 4x16-bit values
949 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
950 std $t1,`$FRAME+8`($sp)
951 std $t2,`$FRAME+16`($sp)
952 std $t3,`$FRAME+24`($sp)
954 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
960 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
965 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
966 std $t5,`$FRAME+40`($sp)
967 std $t6,`$FRAME+48`($sp)
968 std $t7,`$FRAME+56`($sp)
971 lfd $A0,8($nap_d) ; load a[j] in double format
973 lfd $A2,24($nap_d) ; load a[j+1] in double format
975 lfd $N0,40($nap_d) ; load n[j] in double format
977 lfd $N2,56($nap_d) ; load n[j+1] in double format
980 lfd $ba,`$FRAME+0`($sp)
981 lfd $bb,`$FRAME+8`($sp)
982 lfd $bc,`$FRAME+16`($sp)
983 lfd $bd,`$FRAME+24`($sp)
984 lfd $na,`$FRAME+32`($sp)
985 lfd $nb,`$FRAME+40`($sp)
986 lfd $nc,`$FRAME+48`($sp)
987 lfd $nd,`$FRAME+56`($sp)
1007 fmadd $T1a,$A0,$bc,$T1a
1008 fmadd $T1b,$A0,$bd,$T1b
1009 fmadd $T2a,$A1,$bc,$T2a
1010 fmadd $T2b,$A1,$bd,$T2b
1011 fmadd $T3a,$A2,$bc,$T3a
1012 fmadd $T3b,$A2,$bd,$T3b
1016 fmadd $T1a,$N1,$na,$T1a
1017 fmadd $T1b,$N1,$nb,$T1b
1018 lfd $A0,8($nap_d) ; load a[j] in double format
1020 fmadd $T2a,$N2,$na,$T2a
1021 fmadd $T2b,$N2,$nb,$T2b
1022 lfd $A2,24($nap_d) ; load a[j+1] in double format
1024 fmadd $T3a,$N3,$na,$T3a
1025 fmadd $T3b,$N3,$nb,$T3b
1026 fmadd $T0a,$N0,$na,$T0a
1027 fmadd $T0b,$N0,$nb,$T0b
1029 fmadd $T1a,$N0,$nc,$T1a
1030 fmadd $T1b,$N0,$nd,$T1b
1031 fmadd $T2a,$N1,$nc,$T2a
1032 fmadd $T2b,$N1,$nd,$T2b
1033 fmadd $T3a,$N2,$nc,$T3a
1034 fmadd $T3b,$N2,$nd,$T3b
1035 fmadd $dota,$N3,$nc,$dota
1036 fmadd $dotb,$N3,$nd,$dotb
1047 stfd $T0a,`$FRAME+0`($sp)
1048 stfd $T0b,`$FRAME+8`($sp)
1049 stfd $T1a,`$FRAME+16`($sp)
1050 stfd $T1b,`$FRAME+24`($sp)
1051 stfd $T2a,`$FRAME+32`($sp)
1052 stfd $T2b,`$FRAME+40`($sp)
1053 stfd $T3a,`$FRAME+48`($sp)
1054 stfd $T3b,`$FRAME+56`($sp)
1062 lfd $N0,40($nap_d) ; load n[j] in double format
1066 fmadd $T0a,$A0,$ba,$dota
1067 fmadd $T0b,$A0,$bb,$dotb
1068 lfd $N2,56($nap_d) ; load n[j+1] in double format
1071 fmadd $T1a,$A0,$bc,$T1a
1072 fmadd $T1b,$A0,$bd,$T1b
1073 fmadd $T2a,$A1,$bc,$T2a
1074 fmadd $T2b,$A1,$bd,$T2b
1075 lfd $A0,8($nap_d) ; load a[j] in double format
1077 fmadd $T3a,$A2,$bc,$T3a
1078 fmadd $T3b,$A2,$bd,$T3b
1081 lfd $A2,24($nap_d) ; load a[j+1] in double format
1084 if ($SIZE_T==8 or $flavour =~ /osx/) {
1086 fmadd $T1a,$N1,$na,$T1a
1087 fmadd $T1b,$N1,$nb,$T1b
1088 ld $t0,`$FRAME+0`($sp)
1089 ld $t1,`$FRAME+8`($sp)
1090 fmadd $T2a,$N2,$na,$T2a
1091 fmadd $T2b,$N2,$nb,$T2b
1092 ld $t2,`$FRAME+16`($sp)
1093 ld $t3,`$FRAME+24`($sp)
1094 fmadd $T3a,$N3,$na,$T3a
1095 fmadd $T3b,$N3,$nb,$T3b
1096 add $t0,$t0,$carry ; can not overflow
1097 ld $t4,`$FRAME+32`($sp)
1098 ld $t5,`$FRAME+40`($sp)
1099 fmadd $T0a,$N0,$na,$T0a
1100 fmadd $T0b,$N0,$nb,$T0b
1104 ld $t6,`$FRAME+48`($sp)
1105 ld $t7,`$FRAME+56`($sp)
1107 fmadd $T1a,$N0,$nc,$T1a
1108 fmadd $T1b,$N0,$nd,$T1b
1109 insrdi $t0,$t1,16,32
1110 ld $t1,8($tp) ; tp[j]
1111 fmadd $T2a,$N1,$nc,$T2a
1112 fmadd $T2b,$N1,$nd,$T2b
1114 fmadd $T3a,$N2,$nc,$T3a
1115 fmadd $T3b,$N2,$nd,$T3b
1117 insrdi $t0,$t2,16,16
1118 fmadd $dota,$N3,$nc,$dota
1119 fmadd $dotb,$N3,$nd,$dotb
1121 ldu $t2,16($tp) ; tp[j+1]
1123 insrdi $t0,$t3,16,0 ; 0..63 bits
1135 insrdi $t4,$t5,16,32
1140 insrdi $t4,$t6,16,16
1142 stfd $T0a,`$FRAME+0`($sp)
1143 stfd $T0b,`$FRAME+8`($sp)
1147 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1153 stfd $T1a,`$FRAME+16`($sp)
1154 stfd $T1b,`$FRAME+24`($sp)
1155 insrdi $t4,$t7,16,0 ; 64..127 bits
1156 srdi $carry,$t7,16 ; upper 33 bits
1157 stfd $T2a,`$FRAME+32`($sp)
1158 stfd $T2b,`$FRAME+40`($sp)
1161 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1167 stfd $T3a,`$FRAME+48`($sp)
1168 stfd $T3b,`$FRAME+56`($sp)
1170 std $t3,-16($tp) ; tp[j-1]
1171 std $t5,-8($tp) ; tp[j]
1175 fmadd $T1a,$N1,$na,$T1a
1176 fmadd $T1b,$N1,$nb,$T1b
1177 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1178 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1179 fmadd $T2a,$N2,$na,$T2a
1180 fmadd $T2b,$N2,$nb,$T2b
1181 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1182 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1183 fmadd $T3a,$N3,$na,$T3a
1184 fmadd $T3b,$N3,$nb,$T3b
1185 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1186 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1190 fmadd $T0a,$N0,$na,$T0a
1191 fmadd $T0b,$N0,$nb,$T0b
1192 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1193 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1195 insrwi $carry,$t1,16,0
1197 fmadd $T1a,$N0,$nc,$T1a
1198 fmadd $T1b,$N0,$nd,$T1b
1202 fmadd $T2a,$N1,$nc,$T2a
1203 fmadd $T2b,$N1,$nd,$T2b
1204 insrwi $t0,$t2,16,0 ; 0..31 bits
1206 insrwi $carry,$t3,16,0
1207 fmadd $T3a,$N2,$nc,$T3a
1208 fmadd $T3b,$N2,$nd,$T3b
1209 lwz $t2,12($tp) ; tp[j]
1214 fmadd $dota,$N3,$nc,$dota
1215 fmadd $dotb,$N3,$nd,$dotb
1217 insrwi $carry,$t5,16,0
1224 insrwi $t4,$t6,16,0 ; 32..63 bits
1226 insrwi $carry,$t7,16,0
1230 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
1231 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
1235 stw $t0,4($tp) ; tp[j-1]
1241 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
1242 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
1245 insrwi $carry,$t3,16,0
1246 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
1247 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
1252 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
1253 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
1256 insrwi $t2,$t6,16,0 ; 64..95 bits
1257 insrwi $carry,$t7,16,0
1262 stfd $T0a,`$FRAME+0`($sp)
1265 stfd $T0b,`$FRAME+8`($sp)
1266 insrwi $carry,$t1,16,0
1269 stfd $T1a,`$FRAME+16`($sp)
1272 insrwi $t0,$t4,16,0 ; 96..127 bits
1273 stfd $T1b,`$FRAME+24`($sp)
1274 insrwi $carry,$t5,16,0
1278 stfd $T2a,`$FRAME+32`($sp)
1280 stfd $T2b,`$FRAME+40`($sp)
1282 stfd $T3a,`$FRAME+48`($sp)
1284 stfd $T3b,`$FRAME+56`($sp)
1285 stw $t2,-4($tp) ; tp[j]
1295 if ($SIZE_T==8 or $flavour =~ /osx/) {
1297 ld $t0,`$FRAME+0`($sp)
1298 ld $t1,`$FRAME+8`($sp)
1299 ld $t2,`$FRAME+16`($sp)
1300 ld $t3,`$FRAME+24`($sp)
1301 ld $t4,`$FRAME+32`($sp)
1302 ld $t5,`$FRAME+40`($sp)
1303 ld $t6,`$FRAME+48`($sp)
1304 ld $t7,`$FRAME+56`($sp)
1305 stfd $dota,`$FRAME+64`($sp)
1306 stfd $dotb,`$FRAME+72`($sp)
1308 add $t0,$t0,$carry ; can not overflow
1312 insrdi $t0,$t1,16,32
1314 ld $t1,8($tp) ; tp[j]
1316 insrdi $t0,$t2,16,16
1318 ldu $t2,16($tp) ; tp[j+1]
1320 insrdi $t0,$t3,16,0 ; 0..63 bits
1325 insrdi $t4,$t5,16,32
1328 insrdi $t4,$t6,16,16
1330 insrdi $t4,$t7,16,0 ; 64..127 bits
1331 srdi $carry,$t7,16 ; upper 33 bits
1332 ld $t6,`$FRAME+64`($sp)
1333 ld $t7,`$FRAME+72`($sp)
1337 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1345 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1353 std $t3,-16($tp) ; tp[j-1]
1354 std $t5,-8($tp) ; tp[j]
1356 add $carry,$carry,$ovf ; comsume upmost overflow
1357 add $t6,$t6,$carry ; can not overflow
1362 std $t6,0($tp) ; tp[num-1]
1366 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1367 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1368 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1369 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1370 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1371 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1372 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1373 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1374 stfd $dota,`$FRAME+64`($sp)
1375 stfd $dotb,`$FRAME+72`($sp)
1380 insrwi $carry,$t1,16,0
1385 insrwi $t0,$t2,16,0 ; 0..31 bits
1386 lwz $t2,12($tp) ; tp[j]
1387 insrwi $carry,$t3,16,0
1393 insrwi $carry,$t5,16,0
1398 insrwi $t4,$t6,16,0 ; 32..63 bits
1399 insrwi $carry,$t7,16,0
1406 stw $t0,4($tp) ; tp[j-1]
1409 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
1410 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
1411 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
1412 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
1413 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
1414 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
1415 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
1416 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
1421 insrwi $carry,$t3,16,0
1426 insrwi $t2,$t6,16,0 ; 64..95 bits
1428 insrwi $carry,$t7,16,0
1434 insrwi $carry,$t1,16,0
1439 insrwi $t0,$t4,16,0 ; 96..127 bits
1440 insrwi $carry,$t5,16,0
1445 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
1446 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
1449 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
1450 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
1454 stw $t2,-4($tp) ; tp[j]
1459 insrwi $carry,$t7,16,0
1468 stw $t6,4($tp) ; tp[num-1]
1475 subf $nap_d,$t7,$nap_d ; rewind pointer
1480 $code.=<<___ if ($SIZE_T==8);
1481 subf $np,$num,$np ; rewind np
1482 addi $j,$j,1 ; restore counter
1483 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
1484 addi $tp,$sp,`$FRAME+$TRANSFER+8`
1485 addi $t4,$sp,`$FRAME+$TRANSFER+16`
1491 Lsub: ldx $t0,$tp,$i
1495 subfe $t0,$t1,$t0 ; tp[j]-np[j]
1496 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
1503 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1506 or $ap,$ap,$np ; ap=borrow?tp:rp
1511 Lcopy: ; copy or in-place refresh
1514 std $i,8($nap_d) ; zap nap_d
1524 stdx $i,$tp,$i ; zap tp at once
1529 $code.=<<___ if ($SIZE_T==4);
1530 subf $np,$num,$np ; rewind np
1531 addi $j,$j,1 ; restore counter
1532 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
1533 addi $tp,$sp,`$FRAME+$TRANSFER`
1536 addi $ap,$sp,`$FRAME+$TRANSFER+4`
1540 Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
1544 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
1548 subfe $t4,$t4,$t0 ; tp[j]-np[j]
1549 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
1550 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
1552 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
1554 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
1563 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1564 addi $tp,$sp,`$FRAME+$TRANSFER+4`
1565 subf $rp,$num,$rp ; rewind rp
1568 or $ap,$ap,$np ; ap=borrow?tp:rp
1569 addi $tp,$sp,`$FRAME+$TRANSFER`
1573 Lcopy: ; copy or in-place refresh
1578 std $i,8($nap_d) ; zap nap_d
1590 std $i,8($tp) ; zap tp at once
1597 li r3,1 ; signal "handled"
1598 $POP r19,`-12*8-13*$SIZE_T`($i)
1599 $POP r20,`-12*8-12*$SIZE_T`($i)
1600 $POP r21,`-12*8-11*$SIZE_T`($i)
1601 $POP r22,`-12*8-10*$SIZE_T`($i)
1602 $POP r23,`-12*8-9*$SIZE_T`($i)
1603 $POP r24,`-12*8-8*$SIZE_T`($i)
1604 $POP r25,`-12*8-7*$SIZE_T`($i)
1605 $POP r26,`-12*8-6*$SIZE_T`($i)
1606 $POP r27,`-12*8-5*$SIZE_T`($i)
1607 $POP r28,`-12*8-4*$SIZE_T`($i)
1608 $POP r29,`-12*8-3*$SIZE_T`($i)
1609 $POP r30,`-12*8-2*$SIZE_T`($i)
1610 $POP r31,`-12*8-1*$SIZE_T`($i)
1626 .byte 0,12,4,0,0x8c,13,6,0
1628 .size .$fname,.-.$fname
1630 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
# Expand every `...` span in the generated text: the text between a pair
# of backticks is evaluated as a Perl expression and replaced by its
# value (this is how offsets such as `$FRAME+8` become plain numbers).
$code =~ s{\`([^\`]*)\`}{eval $1}gem;