2 # Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The reason for undertaking this effort is basically the following. Even though
20 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
21 # performance was observed to be less than impressive, essentially as
22 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
23 # Well, it's not surprising that IBM had to make some sacrifices to
24 # boost the clock frequency that much, but no overall improvement?
25 # Having observed how much difference switching to FPU made on
26 # UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
27 # Unfortunately the resulting performance improvement is not as
28 # impressive, ~30%, and in absolute terms is still very far from what
29 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
30 # something wrong, but in the lack of assembler level micro-profiling
31 # data or at least decent platform guide I can't tell... Or better
32 # results might be achieved with VMX... Anyway, this module provides
33 # *worse* performance on other PowerPC implementations, ~40-15% slower
34 # on PPC970 depending on key length and ~40% slower on Power 5 for all
35 # key lengths. As it's obviously inappropriate as "best all-round"
36 # alternative, it has to be complemented with run-time CPU family
37 # detection. Oh! It should also be noted that, unlike on other PowerPC
38 # implementations, the IALU ppc-mont.pl module performs *suboptimally* on
39 # >=1024-bit key lengths on Power 6. It should also be noted that
40 # *everything* said so far applies to 64-bit builds! As far as 32-bit
41 # application executed on 64-bit CPU goes, this module is likely to
42 # become preferred choice, because it's easy to adapt it for such
43 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
47 # Micro-profiling assisted optimization results in ~15% improvement
48 # over original ppc64-mont.pl version, or overall ~50% improvement
49 # over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
50 # Power 6 CPU, this module is 5-150% faster depending on key length,
51 # [hereafter] more for longer keys. But if compared to ppc-mont.pl
52 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
53 # in absolute terms, but it's apparently the way Power 6 is...
57 # Adapted for 32-bit build this module delivers 25-120%, yes, more
58 # than *twice* for longer keys, performance improvement over 32-bit
59 # ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
60 # even 64-bit integer operations and the trouble is that most PPC
61 # operating systems don't preserve upper halves of general purpose
62 # registers upon 32-bit signal delivery. They do preserve them upon
63 # context switch, but not signalling:-( This means that asynchronous
64 # signals have to be blocked upon entry to this subroutine. Signal
65 # masking (and of course complementary unmasking) has quite an impact
66 # on performance, naturally larger for shorter keys. It's so severe
67 # that 512-bit key performance can be as low as 1/3 of expected one.
68 # This is why this routine can be engaged for longer key operations
69 # only on these OSes, see crypto/ppccap.c for further details. MacOS X
70 # is an exception from this and doesn't require signal masking, and
71 # that's where the above improvement coefficients were collected. For
72 # the others, an alternative would be to break the dependence on upper halves
73 # of GPRs by sticking to 32-bit integer operations...
77 # Remove above mentioned dependence on GPRs' upper halves in 32-bit
78 # build. No signal masking overhead, but integer instructions are
79 # *more* numerous... It's still "universally" faster than 32-bit
80 # ppc-mont.pl, but improvement coefficient is not as impressive
83 # $output is the last argument if it looks like a file (it has an extension)
84 # $flavour is the first argument if it doesn't look like a file
# Both values are consumed from @ARGV (pop/shift at file scope operate on
# @ARGV). Each is left undef when @ARGV is empty or the candidate argument
# does not match its pattern. $output is tested first, so a single argument
# such as "out.s" becomes $output and leaves $flavour undefined.
85 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
86 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Per-flavour configuration. A flavour containing "32" takes the first branch;
# otherwise one containing "64" takes the second. Anything else, including an
# undefined $flavour, aborts generation with "nonsense ...".
88 if ($flavour =~ /32/) {
# Name of the generated routine (the same in both branches).
91 $fname= "bn_mul_mont_fpu64";
93 $STUX= "stwux"; # store indexed and update
96 } elsif ($flavour =~ /64/) {
99 $fname= "bn_mul_mont_fpu64";
101 # same as above, but 64-bit mnemonics...
102 $STUX= "stdux"; # store indexed and update
105 } else { die "nonsense $flavour"; }
# 4 when the flavour name ends in "le", otherwise 0. The generated code XORs
# this value into 32-bit word offsets (e.g. 4^$LITTLE_ENDIAN, 12^$LITTLE_ENDIAN),
# which swaps which half of each 8-byte slot a given lwz addresses.
107 $LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;
# Locate the ppc-xlate.pl translator relative to this script: capture the
# directory part of $0, then look for the translator in that directory and,
# failing that, in ../../perlasm/ relative to it. Give up if neither exists.
109 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
110 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
111 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
112 die "can't locate ppc-xlate.pl";
# Re-open STDOUT as a pipe into the translator, run with the current perl
# interpreter ($^X) and given $flavour and $output as arguments. Anything
# written to STDOUT from here on goes to the translator's standard input.
# NOTE(review): the command is one interpolated string and only $output is
# quoted; $^X, $xlate and $flavour are presumably free of shell
# metacharacters and spaces -- confirm against the build system's invocation.
114 open STDOUT,"| $^X $xlate $flavour \"$output\""
115 or die "can't call $xlate: $!";
# Symbolic names for the stack frame constant and for registers. The generated
# assembly text refers to registers only through these Perl variables.
117 $FRAME=64; # padded frame header
129 $rp="r9"; # $rp is reassigned
133 # non-volatile registers
137 $nap_d="r22"; # interleaved ap and np in double format
139 $t0="r24"; # temporary registers
148 # PPC offers enough register bank capacity to unroll inner loops twice
# Floating-point register roles, as used by the generated code:
#   $ba..$bd    loaded (lfd) from transfer-zone slots $FRAME+0..+24, which
#               receive bp[i] split into 4x16-bit values
#   $na..$nd    loaded from slots $FRAME+32..+56, which receive
#               (ap[0]*bp[i]+tp[0])*n0 split into 4x16-bit values
#   $dota/$dotb accumulators carried from one loop step into the next; they
#               are the addends of the fmadd that starts $T0a/$T0b
#   $A0..$A3    loaded from slots $FRAME+64..+88, which hold words of ap[]
#               stored as 32-bit values
#   $N0..$N3    loaded from slots $FRAME+96..+120, which hold words of np[]
#               stored as 32-bit values
#   $T0a..$T3b  fmadd accumulators, written back to slots $FRAME+0..+56
#               with stfd
172 $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
173 $na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
174 $dota="f8"; $dotb="f9";
175 $A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
176 $N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
177 $T0a="f24"; $T0b="f25";
178 $T1a="f26"; $T1b="f27";
179 $T2a="f28"; $T2b="f29";
180 $T3a="f30"; $T3b="f31";
182 # sp----------->+-------------------------------+
184 # +-------------------------------+
186 # +64 +-------------------------------+
187 # | 16 gpr<->fpr transfer zone |
190 # +16*8 +-------------------------------+
191 # | __int64 tmp[-1] |
192 # +-------------------------------+
193 # | __int64 tmp[num] |
197 # +(num+1)*8 +-------------------------------+
198 # | padding to 64 byte boundary |
200 # +X +-------------------------------+
201 # | double nap_d[4*num] |
205 # +-------------------------------+
207 # -13*size_t +-------------------------------+
208 # | 13 saved gpr, r19-r31 |
211 # -12*8 +-------------------------------+
212 # | 12 saved fpr, f20-f31 |
215 # +-------------------------------+
224 cmpwi $num,`3*8/$SIZE_T`
225 mr $rp,r3 ; $rp is reassigned
226 li r3,0 ; possible "not handled" return code
228 andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
231 slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
233 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
234 add $tp,$tp,$num ; place for tp[num+1]
235 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
236 subf $tp,$tp,$sp ; $sp-$tp
237 and $tp,$tp,$i ; minimize TLB usage
238 subf $tp,$sp,$tp ; $tp-$sp
240 $STUX $sp,$sp,$tp ; alloca
242 $PUSH r19,`-12*8-13*$SIZE_T`($i)
243 $PUSH r20,`-12*8-12*$SIZE_T`($i)
244 $PUSH r21,`-12*8-11*$SIZE_T`($i)
245 $PUSH r22,`-12*8-10*$SIZE_T`($i)
246 $PUSH r23,`-12*8-9*$SIZE_T`($i)
247 $PUSH r24,`-12*8-8*$SIZE_T`($i)
248 $PUSH r25,`-12*8-7*$SIZE_T`($i)
249 $PUSH r26,`-12*8-6*$SIZE_T`($i)
250 $PUSH r27,`-12*8-5*$SIZE_T`($i)
251 $PUSH r28,`-12*8-4*$SIZE_T`($i)
252 $PUSH r29,`-12*8-3*$SIZE_T`($i)
253 $PUSH r30,`-12*8-2*$SIZE_T`($i)
254 $PUSH r31,`-12*8-1*$SIZE_T`($i)
268 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
271 and $nap_d,$nap_d,$i ; align to 64 bytes
272 ; nap_d is off by 1, because it's used with stfdu/lfdu
273 addi $nap_d,$nap_d,-8
274 srwi $j,$num,`3+1` ; counter register, num/2
276 addi $tp,$sp,`$FRAME+$TRANSFER-8`
281 $code.=<<___ if ($SIZE_T==8);
282 ld $a0,0($ap) ; pull ap[0] value
283 ld $t3,0($bp) ; bp[0]
284 ld $n0,0($n0) ; pull n0[0] value
286 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
287 ; transfer bp[0] to FPU as 4x16-bit values
292 std $t0,`$FRAME+0`($sp)
293 std $t1,`$FRAME+8`($sp)
294 std $t2,`$FRAME+16`($sp)
295 std $t3,`$FRAME+24`($sp)
297 mulld $t7,$t7,$n0 ; tp[0]*n0
298 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
303 std $t4,`$FRAME+32`($sp)
304 std $t5,`$FRAME+40`($sp)
305 std $t6,`$FRAME+48`($sp)
306 std $t7,`$FRAME+56`($sp)
308 extrdi $t0,$a0,32,32 ; lwz $t0,4($ap)
309 extrdi $t1,$a0,32,0 ; lwz $t1,0($ap)
310 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[1] as 32-bit word pair
311 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
312 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[0] as 32-bit word pair
313 lwz $t5,`0^$LITTLE_ENDIAN`($np)
314 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[1] as 32-bit word pair
315 lwz $t7,`8^$LITTLE_ENDIAN`($np)
317 $code.=<<___ if ($SIZE_T==4);
318 lwz $a0,0($ap) ; pull ap[0,1] value
322 lwz $t1,0($bp) ; bp[0,1]
324 lwz $n0,0($n1) ; pull n0[0,1] value
327 mullw $t4,$a0,$t1 ; mulld ap[0]*bp[0]
333 ; transfer bp[0] to FPU as 4x16-bit values
338 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
339 std $t1,`$FRAME+8`($sp)
340 std $t2,`$FRAME+16`($sp)
341 std $t3,`$FRAME+24`($sp)
343 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
349 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
354 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
355 std $t5,`$FRAME+40`($sp)
356 std $t6,`$FRAME+48`($sp)
357 std $t7,`$FRAME+56`($sp)
359 mr $t0,$a0 ; lwz $t0,0($ap)
360 mr $t1,$a1 ; lwz $t1,4($ap)
361 lwz $t2,8($ap) ; load a[j..j+3] as 32-bit word pairs
363 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
369 lfd $ba,`$FRAME+0`($sp)
370 lfd $bb,`$FRAME+8`($sp)
371 lfd $bc,`$FRAME+16`($sp)
372 lfd $bd,`$FRAME+24`($sp)
373 lfd $na,`$FRAME+32`($sp)
374 lfd $nb,`$FRAME+40`($sp)
375 lfd $nc,`$FRAME+48`($sp)
376 lfd $nd,`$FRAME+56`($sp)
377 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
378 std $t1,`$FRAME+72`($sp)
379 std $t2,`$FRAME+80`($sp)
380 std $t3,`$FRAME+88`($sp)
381 std $t4,`$FRAME+96`($sp)
382 std $t5,`$FRAME+104`($sp)
383 std $t6,`$FRAME+112`($sp)
384 std $t7,`$FRAME+120`($sp)
394 lfd $A0,`$FRAME+64`($sp)
395 lfd $A1,`$FRAME+72`($sp)
396 lfd $A2,`$FRAME+80`($sp)
397 lfd $A3,`$FRAME+88`($sp)
398 lfd $N0,`$FRAME+96`($sp)
399 lfd $N1,`$FRAME+104`($sp)
400 lfd $N2,`$FRAME+112`($sp)
401 lfd $N3,`$FRAME+120`($sp)
415 stfd $A0,8($nap_d) ; save a[j] in double format
419 stfd $A2,24($nap_d) ; save a[j+1] in double format
423 stfd $N0,40($nap_d) ; save n[j] in double format
427 stfd $N2,56($nap_d) ; save n[j+1] in double format
430 fmadd $T1a,$A0,$bc,$T1a
431 fmadd $T1b,$A0,$bd,$T1b
432 fmadd $T2a,$A1,$bc,$T2a
433 fmadd $T2b,$A1,$bd,$T2b
434 fmadd $T3a,$A2,$bc,$T3a
435 fmadd $T3b,$A2,$bd,$T3b
439 fmadd $T1a,$N1,$na,$T1a
440 fmadd $T1b,$N1,$nb,$T1b
441 fmadd $T2a,$N2,$na,$T2a
442 fmadd $T2b,$N2,$nb,$T2b
443 fmadd $T3a,$N3,$na,$T3a
444 fmadd $T3b,$N3,$nb,$T3b
445 fmadd $T0a,$N0,$na,$T0a
446 fmadd $T0b,$N0,$nb,$T0b
448 fmadd $T1a,$N0,$nc,$T1a
449 fmadd $T1b,$N0,$nd,$T1b
450 fmadd $T2a,$N1,$nc,$T2a
451 fmadd $T2b,$N1,$nd,$T2b
452 fmadd $T3a,$N2,$nc,$T3a
453 fmadd $T3b,$N2,$nd,$T3b
454 fmadd $dota,$N3,$nc,$dota
455 fmadd $dotb,$N3,$nd,$dotb
466 stfd $T0a,`$FRAME+0`($sp)
467 stfd $T0b,`$FRAME+8`($sp)
468 stfd $T1a,`$FRAME+16`($sp)
469 stfd $T1b,`$FRAME+24`($sp)
470 stfd $T2a,`$FRAME+32`($sp)
471 stfd $T2b,`$FRAME+40`($sp)
472 stfd $T3a,`$FRAME+48`($sp)
473 stfd $T3b,`$FRAME+56`($sp)
478 $code.=<<___ if ($SIZE_T==8);
479 lwz $t0,`4^$LITTLE_ENDIAN`($ap) ; load a[j] as 32-bit word pair
480 lwz $t1,`0^$LITTLE_ENDIAN`($ap)
481 lwz $t2,`12^$LITTLE_ENDIAN`($ap) ; load a[j+1] as 32-bit word pair
482 lwz $t3,`8^$LITTLE_ENDIAN`($ap)
483 lwz $t4,`4^$LITTLE_ENDIAN`($np) ; load n[j] as 32-bit word pair
484 lwz $t5,`0^$LITTLE_ENDIAN`($np)
485 lwz $t6,`12^$LITTLE_ENDIAN`($np) ; load n[j+1] as 32-bit word pair
486 lwz $t7,`8^$LITTLE_ENDIAN`($np)
488 $code.=<<___ if ($SIZE_T==4);
489 lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
493 lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
499 std $t0,`$FRAME+64`($sp) ; yes, std even in 32-bit build
500 std $t1,`$FRAME+72`($sp)
501 std $t2,`$FRAME+80`($sp)
502 std $t3,`$FRAME+88`($sp)
503 std $t4,`$FRAME+96`($sp)
504 std $t5,`$FRAME+104`($sp)
505 std $t6,`$FRAME+112`($sp)
506 std $t7,`$FRAME+120`($sp)
508 if ($SIZE_T==8 or $flavour =~ /osx/) {
510 ld $t0,`$FRAME+0`($sp)
511 ld $t1,`$FRAME+8`($sp)
512 ld $t2,`$FRAME+16`($sp)
513 ld $t3,`$FRAME+24`($sp)
514 ld $t4,`$FRAME+32`($sp)
515 ld $t5,`$FRAME+40`($sp)
516 ld $t6,`$FRAME+48`($sp)
517 ld $t7,`$FRAME+56`($sp)
521 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
522 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
523 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
524 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
525 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
526 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
527 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
528 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
532 lfd $A0,`$FRAME+64`($sp)
533 lfd $A1,`$FRAME+72`($sp)
534 lfd $A2,`$FRAME+80`($sp)
535 lfd $A3,`$FRAME+88`($sp)
536 lfd $N0,`$FRAME+96`($sp)
537 lfd $N1,`$FRAME+104`($sp)
538 lfd $N2,`$FRAME+112`($sp)
539 lfd $N3,`$FRAME+120`($sp)
555 stfd $A0,8($nap_d) ; save a[j] in double format
559 fmadd $T0a,$A0,$ba,$dota
560 fmadd $T0b,$A0,$bb,$dotb
561 stfd $A2,24($nap_d) ; save a[j+1] in double format
564 if ($SIZE_T==8 or $flavour =~ /osx/) {
566 fmadd $T1a,$A0,$bc,$T1a
567 fmadd $T1b,$A0,$bd,$T1b
568 fmadd $T2a,$A1,$bc,$T2a
569 fmadd $T2b,$A1,$bd,$T2b
570 stfd $N0,40($nap_d) ; save n[j] in double format
572 fmadd $T3a,$A2,$bc,$T3a
573 fmadd $T3b,$A2,$bd,$T3b
574 add $t0,$t0,$carry ; can not overflow
577 stfd $N2,56($nap_d) ; save n[j+1] in double format
583 fmadd $T1a,$N1,$na,$T1a
584 fmadd $T1b,$N1,$nb,$T1b
586 fmadd $T2a,$N2,$na,$T2a
587 fmadd $T2b,$N2,$nb,$T2b
589 fmadd $T3a,$N3,$na,$T3a
590 fmadd $T3b,$N3,$nb,$T3b
592 fmadd $T0a,$N0,$na,$T0a
593 fmadd $T0b,$N0,$nb,$T0b
598 fmadd $T1a,$N0,$nc,$T1a
599 fmadd $T1b,$N0,$nd,$T1b
600 insrdi $t0,$t3,16,0 ; 0..63 bits
601 fmadd $T2a,$N1,$nc,$T2a
602 fmadd $T2b,$N1,$nd,$T2b
604 fmadd $T3a,$N2,$nc,$T3a
605 fmadd $T3b,$N2,$nd,$T3b
607 fmadd $dota,$N3,$nc,$dota
608 fmadd $dotb,$N3,$nd,$dotb
625 insrdi $t4,$t7,16,0 ; 64..127 bits
626 srdi $carry,$t7,16 ; upper 33 bits
628 stfd $T0a,`$FRAME+0`($sp)
629 stfd $T0b,`$FRAME+8`($sp)
630 stfd $T1a,`$FRAME+16`($sp)
631 stfd $T1b,`$FRAME+24`($sp)
632 stfd $T2a,`$FRAME+32`($sp)
633 stfd $T2b,`$FRAME+40`($sp)
634 stfd $T3a,`$FRAME+48`($sp)
635 stfd $T3b,`$FRAME+56`($sp)
636 std $t0,8($tp) ; tp[j-1]
637 stdu $t4,16($tp) ; tp[j]
641 fmadd $T1a,$A0,$bc,$T1a
642 fmadd $T1b,$A0,$bd,$T1b
646 fmadd $T2a,$A1,$bc,$T2a
647 fmadd $T2b,$A1,$bd,$T2b
648 stfd $N0,40($nap_d) ; save n[j] in double format
651 insrwi $carry,$t1,16,0
652 fmadd $T3a,$A2,$bc,$T3a
653 fmadd $T3b,$A2,$bd,$T3b
659 stfd $N2,56($nap_d) ; save n[j+1] in double format
661 insrwi $t0,$t2,16,0 ; 0..31 bits
663 insrwi $carry,$t3,16,0
665 fmadd $T1a,$N1,$na,$T1a
666 fmadd $T1b,$N1,$nb,$T1b
667 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
668 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
672 fmadd $T2a,$N2,$na,$T2a
673 fmadd $T2b,$N2,$nb,$T2b
675 insrwi $carry,$t5,16,0
676 fmadd $T3a,$N3,$na,$T3a
677 fmadd $T3b,$N3,$nb,$T3b
681 fmadd $T0a,$N0,$na,$T0a
682 fmadd $T0b,$N0,$nb,$T0b
683 insrwi $t4,$t6,16,0 ; 32..63 bits
685 insrwi $carry,$t7,16,0
687 fmadd $T1a,$N0,$nc,$T1a
688 fmadd $T1b,$N0,$nd,$T1b
689 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
690 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
694 fmadd $T2a,$N1,$nc,$T2a
695 fmadd $T2b,$N1,$nd,$T2b
696 stw $t0,12($tp) ; tp[j-1]
699 insrwi $carry,$t3,16,0
700 fmadd $T3a,$N2,$nc,$T3a
701 fmadd $T3b,$N2,$nd,$T3b
702 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
703 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
707 fmadd $dota,$N3,$nc,$dota
708 fmadd $dotb,$N3,$nd,$dotb
709 insrwi $t2,$t6,16,0 ; 64..95 bits
711 insrwi $carry,$t7,16,0
715 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
716 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
723 insrwi $carry,$t1,16,0
731 insrwi $t0,$t4,16,0 ; 96..127 bits
733 insrwi $carry,$t5,16,0
735 stfd $T0a,`$FRAME+0`($sp)
736 stfd $T0b,`$FRAME+8`($sp)
737 stfd $T1a,`$FRAME+16`($sp)
738 stfd $T1b,`$FRAME+24`($sp)
739 stfd $T2a,`$FRAME+32`($sp)
740 stfd $T2b,`$FRAME+40`($sp)
741 stfd $T3a,`$FRAME+48`($sp)
742 stfd $T3b,`$FRAME+56`($sp)
743 stw $t2,20($tp) ; tp[j]
753 if ($SIZE_T==8 or $flavour =~ /osx/) {
755 ld $t0,`$FRAME+0`($sp)
756 ld $t1,`$FRAME+8`($sp)
757 ld $t2,`$FRAME+16`($sp)
758 ld $t3,`$FRAME+24`($sp)
759 ld $t4,`$FRAME+32`($sp)
760 ld $t5,`$FRAME+40`($sp)
761 ld $t6,`$FRAME+48`($sp)
762 ld $t7,`$FRAME+56`($sp)
763 stfd $dota,`$FRAME+64`($sp)
764 stfd $dotb,`$FRAME+72`($sp)
766 add $t0,$t0,$carry ; can not overflow
776 insrdi $t0,$t3,16,0 ; 0..63 bits
786 insrdi $t4,$t7,16,0 ; 64..127 bits
787 srdi $carry,$t7,16 ; upper 33 bits
788 ld $t6,`$FRAME+64`($sp)
789 ld $t7,`$FRAME+72`($sp)
791 std $t0,8($tp) ; tp[j-1]
792 stdu $t4,16($tp) ; tp[j]
794 add $t6,$t6,$carry ; can not overflow
799 std $t6,8($tp) ; tp[num-1]
803 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
804 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
805 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
806 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
807 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
808 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
809 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
810 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
811 stfd $dota,`$FRAME+64`($sp)
812 stfd $dotb,`$FRAME+72`($sp)
817 insrwi $carry,$t1,16,0
822 insrwi $t0,$t2,16,0 ; 0..31 bits
823 insrwi $carry,$t3,16,0
828 insrwi $carry,$t5,16,0
833 insrwi $t4,$t6,16,0 ; 32..63 bits
834 insrwi $carry,$t7,16,0
836 stw $t0,12($tp) ; tp[j-1]
839 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
840 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
841 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
842 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
843 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
844 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
845 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
846 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
851 insrwi $carry,$t3,16,0
856 insrwi $t2,$t6,16,0 ; 64..95 bits
857 insrwi $carry,$t7,16,0
862 insrwi $carry,$t1,16,0
867 insrwi $t0,$t4,16,0 ; 96..127 bits
868 insrwi $carry,$t5,16,0
870 stw $t2,20($tp) ; tp[j]
873 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
874 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
875 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
876 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
881 insrwi $carry,$t7,16,0
890 stw $t6,12($tp) ; tp[num-1]
896 subf $nap_d,$t7,$nap_d ; rewind pointer
901 addi $tp,$sp,`$FRAME+$TRANSFER`
905 $code.=<<___ if ($SIZE_T==8);
906 ldx $t3,$bp,$i ; bp[i]
908 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
909 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
910 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
911 ; transfer bp[i] to FPU as 4x16-bit values
916 std $t0,`$FRAME+0`($sp)
917 std $t1,`$FRAME+8`($sp)
918 std $t2,`$FRAME+16`($sp)
919 std $t3,`$FRAME+24`($sp)
921 mulld $t7,$t7,$n0 ; tp[0]*n0
922 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
927 std $t4,`$FRAME+32`($sp)
928 std $t5,`$FRAME+40`($sp)
929 std $t6,`$FRAME+48`($sp)
930 std $t7,`$FRAME+56`($sp)
932 $code.=<<___ if ($SIZE_T==4);
935 lwz $t1,0($t0) ; bp[i,i+1]
938 mullw $t4,$a0,$t1 ; ap[0]*bp[i]
939 lwz $t0,`$FRAME+$TRANSFER+8+4`($sp) ; tp[0]
941 lwz $t2,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
946 addc $t4,$t4,$t0 ; ap[0]*bp[i]+tp[0]
948 ; transfer bp[i] to FPU as 4x16-bit values
953 std $t0,`$FRAME+0`($sp) ; yes, std in 32-bit build
954 std $t1,`$FRAME+8`($sp)
955 std $t2,`$FRAME+16`($sp)
956 std $t3,`$FRAME+24`($sp)
958 mullw $t0,$t4,$n0 ; mulld tp[0]*n0
964 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
969 std $t4,`$FRAME+32`($sp) ; yes, std in 32-bit build
970 std $t5,`$FRAME+40`($sp)
971 std $t6,`$FRAME+48`($sp)
972 std $t7,`$FRAME+56`($sp)
975 lfd $A0,8($nap_d) ; load a[j] in double format
977 lfd $A2,24($nap_d) ; load a[j+1] in double format
979 lfd $N0,40($nap_d) ; load n[j] in double format
981 lfd $N2,56($nap_d) ; load n[j+1] in double format
984 lfd $ba,`$FRAME+0`($sp)
985 lfd $bb,`$FRAME+8`($sp)
986 lfd $bc,`$FRAME+16`($sp)
987 lfd $bd,`$FRAME+24`($sp)
988 lfd $na,`$FRAME+32`($sp)
989 lfd $nb,`$FRAME+40`($sp)
990 lfd $nc,`$FRAME+48`($sp)
991 lfd $nd,`$FRAME+56`($sp)
1011 fmadd $T1a,$A0,$bc,$T1a
1012 fmadd $T1b,$A0,$bd,$T1b
1013 fmadd $T2a,$A1,$bc,$T2a
1014 fmadd $T2b,$A1,$bd,$T2b
1015 fmadd $T3a,$A2,$bc,$T3a
1016 fmadd $T3b,$A2,$bd,$T3b
1020 fmadd $T1a,$N1,$na,$T1a
1021 fmadd $T1b,$N1,$nb,$T1b
1022 lfd $A0,8($nap_d) ; load a[j] in double format
1024 fmadd $T2a,$N2,$na,$T2a
1025 fmadd $T2b,$N2,$nb,$T2b
1026 lfd $A2,24($nap_d) ; load a[j+1] in double format
1028 fmadd $T3a,$N3,$na,$T3a
1029 fmadd $T3b,$N3,$nb,$T3b
1030 fmadd $T0a,$N0,$na,$T0a
1031 fmadd $T0b,$N0,$nb,$T0b
1033 fmadd $T1a,$N0,$nc,$T1a
1034 fmadd $T1b,$N0,$nd,$T1b
1035 fmadd $T2a,$N1,$nc,$T2a
1036 fmadd $T2b,$N1,$nd,$T2b
1037 fmadd $T3a,$N2,$nc,$T3a
1038 fmadd $T3b,$N2,$nd,$T3b
1039 fmadd $dota,$N3,$nc,$dota
1040 fmadd $dotb,$N3,$nd,$dotb
1051 stfd $T0a,`$FRAME+0`($sp)
1052 stfd $T0b,`$FRAME+8`($sp)
1053 stfd $T1a,`$FRAME+16`($sp)
1054 stfd $T1b,`$FRAME+24`($sp)
1055 stfd $T2a,`$FRAME+32`($sp)
1056 stfd $T2b,`$FRAME+40`($sp)
1057 stfd $T3a,`$FRAME+48`($sp)
1058 stfd $T3b,`$FRAME+56`($sp)
1066 lfd $N0,40($nap_d) ; load n[j] in double format
1070 fmadd $T0a,$A0,$ba,$dota
1071 fmadd $T0b,$A0,$bb,$dotb
1072 lfd $N2,56($nap_d) ; load n[j+1] in double format
1075 fmadd $T1a,$A0,$bc,$T1a
1076 fmadd $T1b,$A0,$bd,$T1b
1077 fmadd $T2a,$A1,$bc,$T2a
1078 fmadd $T2b,$A1,$bd,$T2b
1079 lfd $A0,8($nap_d) ; load a[j] in double format
1081 fmadd $T3a,$A2,$bc,$T3a
1082 fmadd $T3b,$A2,$bd,$T3b
1085 lfd $A2,24($nap_d) ; load a[j+1] in double format
1088 if ($SIZE_T==8 or $flavour =~ /osx/) {
1090 fmadd $T1a,$N1,$na,$T1a
1091 fmadd $T1b,$N1,$nb,$T1b
1092 ld $t0,`$FRAME+0`($sp)
1093 ld $t1,`$FRAME+8`($sp)
1094 fmadd $T2a,$N2,$na,$T2a
1095 fmadd $T2b,$N2,$nb,$T2b
1096 ld $t2,`$FRAME+16`($sp)
1097 ld $t3,`$FRAME+24`($sp)
1098 fmadd $T3a,$N3,$na,$T3a
1099 fmadd $T3b,$N3,$nb,$T3b
1100 add $t0,$t0,$carry ; can not overflow
1101 ld $t4,`$FRAME+32`($sp)
1102 ld $t5,`$FRAME+40`($sp)
1103 fmadd $T0a,$N0,$na,$T0a
1104 fmadd $T0b,$N0,$nb,$T0b
1108 ld $t6,`$FRAME+48`($sp)
1109 ld $t7,`$FRAME+56`($sp)
1111 fmadd $T1a,$N0,$nc,$T1a
1112 fmadd $T1b,$N0,$nd,$T1b
1113 insrdi $t0,$t1,16,32
1114 ld $t1,8($tp) ; tp[j]
1115 fmadd $T2a,$N1,$nc,$T2a
1116 fmadd $T2b,$N1,$nd,$T2b
1118 fmadd $T3a,$N2,$nc,$T3a
1119 fmadd $T3b,$N2,$nd,$T3b
1121 insrdi $t0,$t2,16,16
1122 fmadd $dota,$N3,$nc,$dota
1123 fmadd $dotb,$N3,$nd,$dotb
1125 ldu $t2,16($tp) ; tp[j+1]
1127 insrdi $t0,$t3,16,0 ; 0..63 bits
1139 insrdi $t4,$t5,16,32
1144 insrdi $t4,$t6,16,16
1146 stfd $T0a,`$FRAME+0`($sp)
1147 stfd $T0b,`$FRAME+8`($sp)
1151 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1157 stfd $T1a,`$FRAME+16`($sp)
1158 stfd $T1b,`$FRAME+24`($sp)
1159 insrdi $t4,$t7,16,0 ; 64..127 bits
1160 srdi $carry,$t7,16 ; upper 33 bits
1161 stfd $T2a,`$FRAME+32`($sp)
1162 stfd $T2b,`$FRAME+40`($sp)
1165 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1171 stfd $T3a,`$FRAME+48`($sp)
1172 stfd $T3b,`$FRAME+56`($sp)
1174 std $t3,-16($tp) ; tp[j-1]
1175 std $t5,-8($tp) ; tp[j]
1179 fmadd $T1a,$N1,$na,$T1a
1180 fmadd $T1b,$N1,$nb,$T1b
1181 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1182 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1183 fmadd $T2a,$N2,$na,$T2a
1184 fmadd $T2b,$N2,$nb,$T2b
1185 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1186 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1187 fmadd $T3a,$N3,$na,$T3a
1188 fmadd $T3b,$N3,$nb,$T3b
1189 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1190 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1194 fmadd $T0a,$N0,$na,$T0a
1195 fmadd $T0b,$N0,$nb,$T0b
1196 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1197 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1199 insrwi $carry,$t1,16,0
1201 fmadd $T1a,$N0,$nc,$T1a
1202 fmadd $T1b,$N0,$nd,$T1b
1206 fmadd $T2a,$N1,$nc,$T2a
1207 fmadd $T2b,$N1,$nd,$T2b
1208 insrwi $t0,$t2,16,0 ; 0..31 bits
1210 insrwi $carry,$t3,16,0
1211 fmadd $T3a,$N2,$nc,$T3a
1212 fmadd $T3b,$N2,$nd,$T3b
1213 lwz $t2,12($tp) ; tp[j]
1218 fmadd $dota,$N3,$nc,$dota
1219 fmadd $dotb,$N3,$nd,$dotb
1221 insrwi $carry,$t5,16,0
1228 insrwi $t4,$t6,16,0 ; 32..63 bits
1230 insrwi $carry,$t7,16,0
1234 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
1235 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
1239 stw $t0,4($tp) ; tp[j-1]
1245 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
1246 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
1249 insrwi $carry,$t3,16,0
1250 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
1251 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
1256 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
1257 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
1260 insrwi $t2,$t6,16,0 ; 64..95 bits
1261 insrwi $carry,$t7,16,0
1266 stfd $T0a,`$FRAME+0`($sp)
1269 stfd $T0b,`$FRAME+8`($sp)
1270 insrwi $carry,$t1,16,0
1273 stfd $T1a,`$FRAME+16`($sp)
1276 insrwi $t0,$t4,16,0 ; 96..127 bits
1277 stfd $T1b,`$FRAME+24`($sp)
1278 insrwi $carry,$t5,16,0
1282 stfd $T2a,`$FRAME+32`($sp)
1284 stfd $T2b,`$FRAME+40`($sp)
1286 stfd $T3a,`$FRAME+48`($sp)
1288 stfd $T3b,`$FRAME+56`($sp)
1289 stw $t2,-4($tp) ; tp[j]
1299 if ($SIZE_T==8 or $flavour =~ /osx/) {
1301 ld $t0,`$FRAME+0`($sp)
1302 ld $t1,`$FRAME+8`($sp)
1303 ld $t2,`$FRAME+16`($sp)
1304 ld $t3,`$FRAME+24`($sp)
1305 ld $t4,`$FRAME+32`($sp)
1306 ld $t5,`$FRAME+40`($sp)
1307 ld $t6,`$FRAME+48`($sp)
1308 ld $t7,`$FRAME+56`($sp)
1309 stfd $dota,`$FRAME+64`($sp)
1310 stfd $dotb,`$FRAME+72`($sp)
1312 add $t0,$t0,$carry ; can not overflow
1316 insrdi $t0,$t1,16,32
1318 ld $t1,8($tp) ; tp[j]
1320 insrdi $t0,$t2,16,16
1322 ldu $t2,16($tp) ; tp[j+1]
1324 insrdi $t0,$t3,16,0 ; 0..63 bits
1329 insrdi $t4,$t5,16,32
1332 insrdi $t4,$t6,16,16
1334 insrdi $t4,$t7,16,0 ; 64..127 bits
1335 srdi $carry,$t7,16 ; upper 33 bits
1336 ld $t6,`$FRAME+64`($sp)
1337 ld $t7,`$FRAME+72`($sp)
1341 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1349 $code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
1357 std $t3,-16($tp) ; tp[j-1]
1358 std $t5,-8($tp) ; tp[j]
1360 add $carry,$carry,$ovf ; consume upmost overflow
1361 add $t6,$t6,$carry ; can not overflow
1366 std $t6,0($tp) ; tp[num-1]
1370 lwz $t1,`$FRAME+0^$LITTLE_ENDIAN`($sp)
1371 lwz $t0,`$FRAME+4^$LITTLE_ENDIAN`($sp)
1372 lwz $t3,`$FRAME+8^$LITTLE_ENDIAN`($sp)
1373 lwz $t2,`$FRAME+12^$LITTLE_ENDIAN`($sp)
1374 lwz $t5,`$FRAME+16^$LITTLE_ENDIAN`($sp)
1375 lwz $t4,`$FRAME+20^$LITTLE_ENDIAN`($sp)
1376 lwz $t7,`$FRAME+24^$LITTLE_ENDIAN`($sp)
1377 lwz $t6,`$FRAME+28^$LITTLE_ENDIAN`($sp)
1378 stfd $dota,`$FRAME+64`($sp)
1379 stfd $dotb,`$FRAME+72`($sp)
1384 insrwi $carry,$t1,16,0
1389 insrwi $t0,$t2,16,0 ; 0..31 bits
1390 lwz $t2,12($tp) ; tp[j]
1391 insrwi $carry,$t3,16,0
1397 insrwi $carry,$t5,16,0
1402 insrwi $t4,$t6,16,0 ; 32..63 bits
1403 insrwi $carry,$t7,16,0
1410 stw $t0,4($tp) ; tp[j-1]
1413 lwz $t3,`$FRAME+32^$LITTLE_ENDIAN`($sp) ; permuted $t1
1414 lwz $t2,`$FRAME+36^$LITTLE_ENDIAN`($sp) ; permuted $t0
1415 lwz $t7,`$FRAME+40^$LITTLE_ENDIAN`($sp) ; permuted $t3
1416 lwz $t6,`$FRAME+44^$LITTLE_ENDIAN`($sp) ; permuted $t2
1417 lwz $t1,`$FRAME+48^$LITTLE_ENDIAN`($sp) ; permuted $t5
1418 lwz $t0,`$FRAME+52^$LITTLE_ENDIAN`($sp) ; permuted $t4
1419 lwz $t5,`$FRAME+56^$LITTLE_ENDIAN`($sp) ; permuted $t7
1420 lwz $t4,`$FRAME+60^$LITTLE_ENDIAN`($sp) ; permuted $t6
1425 insrwi $carry,$t3,16,0
1430 insrwi $t2,$t6,16,0 ; 64..95 bits
1432 insrwi $carry,$t7,16,0
1438 insrwi $carry,$t1,16,0
1443 insrwi $t0,$t4,16,0 ; 96..127 bits
1444 insrwi $carry,$t5,16,0
1449 lwz $t7,`$FRAME+64^$LITTLE_ENDIAN`($sp)
1450 lwz $t6,`$FRAME+68^$LITTLE_ENDIAN`($sp)
1453 lwz $t5,`$FRAME+72^$LITTLE_ENDIAN`($sp)
1454 lwz $t4,`$FRAME+76^$LITTLE_ENDIAN`($sp)
1458 stw $t2,-4($tp) ; tp[j]
1463 insrwi $carry,$t7,16,0
1472 stw $t6,4($tp) ; tp[num-1]
1479 subf $nap_d,$t7,$nap_d ; rewind pointer
1484 $code.=<<___ if ($SIZE_T==8);
1485 subf $np,$num,$np ; rewind np
1486 addi $j,$j,1 ; restore counter
1487 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
1488 addi $tp,$sp,`$FRAME+$TRANSFER+8`
1489 addi $t4,$sp,`$FRAME+$TRANSFER+16`
1495 Lsub: ldx $t0,$tp,$i
1499 subfe $t0,$t1,$t0 ; tp[j]-np[j]
1500 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
1507 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1511 Lcopy: ; conditional copy
1516 std $i,8($nap_d) ; zap nap_d
1532 stdx $i,$tp,$i ; zap tp at once
1537 $code.=<<___ if ($SIZE_T==4);
1538 subf $np,$num,$np ; rewind np
1539 addi $j,$j,1 ; restore counter
1540 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
1541 addi $tp,$sp,`$FRAME+$TRANSFER`
1544 addi $ap,$sp,`$FRAME+$TRANSFER+4`
1548 Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
1552 lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
1556 subfe $t4,$t4,$t0 ; tp[j]-np[j]
1557 stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
1558 subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
1560 subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
1562 subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
1571 subfe $ovf,$i,$ovf ; handle upmost overflow bit
1572 addi $ap,$sp,`$FRAME+$TRANSFER+4`
1573 subf $rp,$num,$rp ; rewind rp
1574 addi $tp,$sp,`$FRAME+$TRANSFER`
1578 Lcopy: ; conditional copy
1587 std $i,8($nap_d) ; zap nap_d
1611 std $i,8($tp) ; zap tp at once
1618 li r3,1 ; signal "handled"
1619 $POP r19,`-12*8-13*$SIZE_T`($i)
1620 $POP r20,`-12*8-12*$SIZE_T`($i)
1621 $POP r21,`-12*8-11*$SIZE_T`($i)
1622 $POP r22,`-12*8-10*$SIZE_T`($i)
1623 $POP r23,`-12*8-9*$SIZE_T`($i)
1624 $POP r24,`-12*8-8*$SIZE_T`($i)
1625 $POP r25,`-12*8-7*$SIZE_T`($i)
1626 $POP r26,`-12*8-6*$SIZE_T`($i)
1627 $POP r27,`-12*8-5*$SIZE_T`($i)
1628 $POP r28,`-12*8-4*$SIZE_T`($i)
1629 $POP r29,`-12*8-3*$SIZE_T`($i)
1630 $POP r30,`-12*8-2*$SIZE_T`($i)
1631 $POP r31,`-12*8-1*$SIZE_T`($i)
1647 .byte 0,12,4,0,0x8c,13,6,0
1649 .size .$fname,.-.$fname
1651 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
# Expand every back-quoted expression in the accumulated text: the text
# between a pair of backticks is evaluated as Perl (string eval through the
# /e modifier) and replaced by the result, e.g. $FRAME+0 or -12*8-13*$SIZE_T.
# The result of eval is not checked, so an expression that fails to evaluate
# is replaced by an empty value rather than stopping generation.
1654 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Close STDOUT (re-opened earlier as a pipe to the translator) and treat a
# failure reported at close time as fatal.
1656 close STDOUT or die "error closing STDOUT: $!";