3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # The reason for undertaking this effort is basically the following. Even though
13 # Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14 # performance was observed to be less than impressive, essentially as
15 # fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16 # Well, it's not surprising that IBM had to make some sacrifices to
17 # boost the clock frequency that much, but no overall improvement?
18 # Having observed how much difference switching to the FPU made on
19 # UltraSPARC, playing the same stunt on Power 6 appeared appropriate...
20 # Unfortunately the resulting performance improvement is not as
21 # impressive, ~30%, and in absolute terms is still very far from what
22 # one would expect from 4.7GHz CPU. There is a chance that I'm doing
23 # something wrong, but in the absence of assembler-level micro-profiling
24 # data or at least a decent platform guide I can't tell... Or better
25 # results might be achieved with VMX... Anyway, this module provides
26 # *worse* performance on other PowerPC implementations, ~40-15% slower
27 # on PPC970 depending on key length and ~40% slower on Power 5 for all
28 # key lengths. As it's obviously inappropriate as "best all-round"
29 # alternative, it has to be complemented with run-time CPU family
30 # detection. Oh! It should also be noted that unlike on other PowerPC
31 # implementations, the IALU ppc-mont.pl module performs *suboptimally* on
32 # >=1024-bit key lengths on Power 6. It should also be noted that
33 # *everything* said so far applies to 64-bit builds! As far as a 32-bit
34 # application executed on a 64-bit CPU goes, this module is likely to
35 # become the preferred choice, because it's easy to adapt it for such a
36 # case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
# Pick the build flavour from the output file name, hook STDOUT up to the
# perlasm translator, and finalize the fixed part of the stack frame.
40 if ($output =~ /32\-mont\.s/) { # output name ends in 32-mont.s: 32-bit flavour
43 $FRAME= $SIZE_T*12+8*12; # 12 size_t-sized slots followed by 12 8-byte fpr save slots
44 $fname= "bn_mul_mont_ppc64"; # name of the generated function for this flavour
46 $STUX= "stwux"; # store indexed and update (32-bit word form)
49 die "not implemented yet"; # this flavour is rejected; script stops here
50 } elsif ($output =~ /64\-mont\.s/) { # output name ends in 64-mont.s: 64-bit flavour
53 $FRAME= $SIZE_T*12+8*12; # same frame formula as the 32-bit branch
54 $fname= "bn_mul_mont"; # name of the generated function for this flavour
56 # same as above, but 64-bit mnemonics...
57 $STUX= "stdux"; # store indexed and update (doubleword form)
60 } else { die "nonsense $output"; } # any other output name is an error
62 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) || # if no further argument is given, pipe STDOUT into ppc-xlate.pl run by the current perl
63 die "can't call ../perlasm/ppc-xlate.pl: $!"; # $! carries the OS error from the failed open
65 $FRAME=($FRAME+63)&~63; # round the fixed frame size up to a multiple of 64 bytes
77 $rp="r9"; # $rp is reassigned
81 # non-volatile registers
82 $nap_d="r14"; # interleaved ap and np in double format
84 $t0="r16"; # temporary registers
93 # PPC offers enough register bank capacity to unroll inner loops twice
144 # sp----------->+-------------------------------+
146 # +-------------------------------+
148 # +-------------------------------+
149 # | 10 saved gpr, r14-r23 |
152 # +12*size_t +-------------------------------+
153 # | 12 saved fpr, f14-f25 |
156 # +12*8 +-------------------------------+
157 # | padding to 64 byte boundary |
159 # +-------------------------------+
160 # | 16 gpr<->fpr transfer zone |
163 # +8*8 +-------------------------------+
164 # | __int64 tmp[-1] |
165 # +-------------------------------+
166 # | __int64 tmp[num] |
170 # +(num+1)*8 +-------------------------------+
171 # | padding to 64 byte boundary |
173 # +-------------------------------+
174 # | double nap_d[4*num] |
178 # +-------------------------------+
188 mr $rp,r3 ; $rp is reassigned
189 li r3,0 ; possible "not handled" return code
191 andi. r0,$num,1 ; $num has to be even
194 slwi $num,$num,3 ; num*=8
196 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
197 add $tp,$tp,$num ; place for tp[num+1]
198 addi $tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
199 subf $tp,$tp,$sp ; $sp-$tp
200 and $tp,$tp,$i ; minimize TLB usage
201 subf $tp,$sp,$tp ; $tp-$sp
202 $STUX $sp,$sp,$tp ; alloca
204 $PUSH r14,`2*$SIZE_T`($sp)
205 $PUSH r15,`3*$SIZE_T`($sp)
206 $PUSH r16,`4*$SIZE_T`($sp)
207 $PUSH r17,`5*$SIZE_T`($sp)
208 $PUSH r18,`6*$SIZE_T`($sp)
209 $PUSH r19,`7*$SIZE_T`($sp)
210 $PUSH r20,`8*$SIZE_T`($sp)
211 $PUSH r21,`9*$SIZE_T`($sp)
212 $PUSH r22,`10*$SIZE_T`($sp)
213 $PUSH r23,`11*$SIZE_T`($sp)
214 stfd f14,`12*$SIZE_T+0`($sp)
215 stfd f15,`12*$SIZE_T+8`($sp)
216 stfd f16,`12*$SIZE_T+16`($sp)
217 stfd f17,`12*$SIZE_T+24`($sp)
218 stfd f18,`12*$SIZE_T+32`($sp)
219 stfd f19,`12*$SIZE_T+40`($sp)
220 stfd f20,`12*$SIZE_T+48`($sp)
221 stfd f21,`12*$SIZE_T+56`($sp)
222 stfd f22,`12*$SIZE_T+64`($sp)
223 stfd f23,`12*$SIZE_T+72`($sp)
224 stfd f24,`12*$SIZE_T+80`($sp)
225 stfd f25,`12*$SIZE_T+88`($sp)
227 ld $a0,0($ap) ; pull ap[0] value
228 ld $n0,0($n0) ; pull n0[0] value
229 ld $t3,0($bp) ; bp[0]
231 addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
234 and $nap_d,$nap_d,$i ; align to 64 bytes
236 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
237 ; nap_d is off by 1, because it's used with stfdu/lfdu
238 addi $nap_d,$nap_d,-8
239 srwi $j,$num,`3+1` ; counter register, num/2
240 mulld $t7,$t7,$n0 ; tp[0]*n0
242 addi $tp,$sp,`$FRAME+$TRANSFER-8`
246 ; transfer bp[0] to FPU as 4x16-bit values
251 std $t0,`$FRAME+0`($sp)
252 std $t1,`$FRAME+8`($sp)
253 std $t2,`$FRAME+16`($sp)
254 std $t3,`$FRAME+24`($sp)
255 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
260 std $t4,`$FRAME+32`($sp)
261 std $t5,`$FRAME+40`($sp)
262 std $t6,`$FRAME+48`($sp)
263 std $t7,`$FRAME+56`($sp)
264 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
266 lwz $t2,4($np) ; load n[j] as 32-bit word pair
268 lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair
270 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
272 lfd $ba,`$FRAME+0`($sp)
273 lfd $bb,`$FRAME+8`($sp)
274 lfd $bc,`$FRAME+16`($sp)
275 lfd $bd,`$FRAME+24`($sp)
276 lfd $na,`$FRAME+32`($sp)
277 lfd $nb,`$FRAME+40`($sp)
278 lfd $nc,`$FRAME+48`($sp)
279 lfd $nd,`$FRAME+56`($sp)
280 std $t0,`$FRAME+64`($sp)
281 std $t1,`$FRAME+72`($sp)
282 std $t2,`$FRAME+80`($sp)
283 std $t3,`$FRAME+88`($sp)
284 std $t4,`$FRAME+96`($sp)
285 std $t5,`$FRAME+104`($sp)
286 std $t6,`$FRAME+112`($sp)
287 std $t7,`$FRAME+120`($sp)
297 lfd $A0,`$FRAME+64`($sp)
298 lfd $A1,`$FRAME+72`($sp)
299 lfd $N0,`$FRAME+80`($sp)
300 lfd $N1,`$FRAME+88`($sp)
301 lfd $A2,`$FRAME+96`($sp)
302 lfd $A3,`$FRAME+104`($sp)
303 lfd $N2,`$FRAME+112`($sp)
304 lfd $N3,`$FRAME+120`($sp)
318 stfd $A0,8($nap_d) ; save a[j] in double format
322 stfd $N0,24($nap_d) ; save n[j] in double format
326 stfd $A2,40($nap_d) ; save a[j+1] in double format
330 stfd $N2,56($nap_d) ; save n[j+1] in double format
333 fmadd $T1a,$A0,$bc,$T1a
334 fmadd $T1b,$A0,$bd,$T1b
335 fmadd $T2a,$A1,$bc,$T2a
336 fmadd $T2b,$A1,$bd,$T2b
337 fmadd $T3a,$A2,$bc,$T3a
338 fmadd $T3b,$A2,$bd,$T3b
342 fmadd $T1a,$N1,$na,$T1a
343 fmadd $T1b,$N1,$nb,$T1b
344 fmadd $T2a,$N2,$na,$T2a
345 fmadd $T2b,$N2,$nb,$T2b
346 fmadd $T3a,$N3,$na,$T3a
347 fmadd $T3b,$N3,$nb,$T3b
348 fmadd $T0a,$N0,$na,$T0a
349 fmadd $T0b,$N0,$nb,$T0b
351 fmadd $T1a,$N0,$nc,$T1a
352 fmadd $T1b,$N0,$nd,$T1b
353 fmadd $T2a,$N1,$nc,$T2a
354 fmadd $T2b,$N1,$nd,$T2b
355 fmadd $T3a,$N2,$nc,$T3a
356 fmadd $T3b,$N2,$nd,$T3b
357 fmadd $dota,$N3,$nc,$dota
358 fmadd $dotb,$N3,$nd,$dotb
369 stfd $T0a,`$FRAME+0`($sp)
370 stfd $T0b,`$FRAME+8`($sp)
371 stfd $T1a,`$FRAME+16`($sp)
372 stfd $T1b,`$FRAME+24`($sp)
373 stfd $T2a,`$FRAME+32`($sp)
374 stfd $T2b,`$FRAME+40`($sp)
375 stfd $T3a,`$FRAME+48`($sp)
376 stfd $T3b,`$FRAME+56`($sp)
380 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
382 lwz $t2,4($np) ; load n[j] as 32-bit word pair
384 lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair
386 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
388 std $t0,`$FRAME+64`($sp)
389 std $t1,`$FRAME+72`($sp)
390 std $t2,`$FRAME+80`($sp)
391 std $t3,`$FRAME+88`($sp)
392 std $t4,`$FRAME+96`($sp)
393 std $t5,`$FRAME+104`($sp)
394 std $t6,`$FRAME+112`($sp)
395 std $t7,`$FRAME+120`($sp)
396 ld $t0,`$FRAME+0`($sp)
397 ld $t1,`$FRAME+8`($sp)
398 ld $t2,`$FRAME+16`($sp)
399 ld $t3,`$FRAME+24`($sp)
400 ld $t4,`$FRAME+32`($sp)
401 ld $t5,`$FRAME+40`($sp)
402 ld $t6,`$FRAME+48`($sp)
403 ld $t7,`$FRAME+56`($sp)
404 lfd $A0,`$FRAME+64`($sp)
405 lfd $A1,`$FRAME+72`($sp)
406 lfd $N0,`$FRAME+80`($sp)
407 lfd $N1,`$FRAME+88`($sp)
408 lfd $A2,`$FRAME+96`($sp)
409 lfd $A3,`$FRAME+104`($sp)
410 lfd $N2,`$FRAME+112`($sp)
411 lfd $N3,`$FRAME+120`($sp)
425 stfd $A0,8($nap_d) ; save a[j] in double format
429 stfd $N0,24($nap_d) ; save n[j] in double format
431 add $t0,$t0,$carry ; can not overflow
434 stfd $A2,40($nap_d) ; save a[j+1] in double format
436 fmadd $T0a,$A0,$ba,$dota
437 fmadd $T0b,$A0,$bb,$dotb
438 stfd $N2,56($nap_d) ; save n[j+1] in double format
443 fmadd $T1a,$A0,$bc,$T1a
444 fmadd $T1b,$A0,$bd,$T1b
445 fmadd $T2a,$A1,$bc,$T2a
446 fmadd $T2b,$A1,$bd,$T2b
449 fmadd $T3a,$A2,$bc,$T3a
450 fmadd $T3b,$A2,$bd,$T3b
457 fmadd $T1a,$N1,$na,$T1a
458 fmadd $T1b,$N1,$nb,$T1b
459 fmadd $T2a,$N2,$na,$T2a
460 fmadd $T2b,$N2,$nb,$T2b
462 fmadd $T3a,$N3,$na,$T3a
463 fmadd $T3b,$N3,$nb,$T3b
464 fmadd $T0a,$N0,$na,$T0a
465 fmadd $T0b,$N0,$nb,$T0b
467 insrdi $t0,$t3,16,0 ; 0..63 bits
470 fmadd $T1a,$N0,$nc,$T1a
471 fmadd $T1b,$N0,$nd,$T1b
472 fmadd $T2a,$N1,$nc,$T2a
473 fmadd $T2b,$N1,$nd,$T2b
475 fmadd $T3a,$N2,$nc,$T3a
476 fmadd $T3b,$N2,$nd,$T3b
477 fmadd $dota,$N3,$nc,$dota
478 fmadd $dotb,$N3,$nd,$dotb
495 insrdi $t4,$t7,16,0 ; 64..127 bits
496 srdi $carry,$t7,16 ; upper 33 bits
498 stfd $T0a,`$FRAME+0`($sp)
499 stfd $T0b,`$FRAME+8`($sp)
500 stfd $T1a,`$FRAME+16`($sp)
501 stfd $T1b,`$FRAME+24`($sp)
502 stfd $T2a,`$FRAME+32`($sp)
503 stfd $T2b,`$FRAME+40`($sp)
504 stfd $T3a,`$FRAME+48`($sp)
505 stfd $T3b,`$FRAME+56`($sp)
506 std $t0,8($tp) ; tp[j-1]
507 stdu $t4,16($tp) ; tp[j]
513 ld $t0,`$FRAME+0`($sp)
514 ld $t1,`$FRAME+8`($sp)
515 ld $t2,`$FRAME+16`($sp)
516 ld $t3,`$FRAME+24`($sp)
517 ld $t4,`$FRAME+32`($sp)
518 ld $t5,`$FRAME+40`($sp)
519 ld $t6,`$FRAME+48`($sp)
520 ld $t7,`$FRAME+56`($sp)
521 stfd $dota,`$FRAME+64`($sp)
522 stfd $dotb,`$FRAME+72`($sp)
524 add $t0,$t0,$carry ; can not overflow
534 insrdi $t0,$t3,16,0 ; 0..63 bits
544 insrdi $t4,$t7,16,0 ; 64..127 bits
545 srdi $carry,$t7,16 ; upper 33 bits
546 ld $t6,`$FRAME+64`($sp)
547 ld $t7,`$FRAME+72`($sp)
549 std $t0,8($tp) ; tp[j-1]
550 stdu $t4,16($tp) ; tp[j]
552 add $t6,$t6,$carry ; can not overflow
557 std $t6,8($tp) ; tp[num-1]
560 subf $nap_d,$t7,$nap_d ; rewind pointer
565 ldx $t3,$bp,$i ; bp[i]
566 ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
567 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
569 addi $tp,$sp,`$FRAME+$TRANSFER`
570 add $t7,$t7,$t6 ; ap[0]*bp[i]+tp[0]
572 mulld $t7,$t7,$n0 ; tp[0]*n0
575 ; transfer bp[i] to FPU as 4x16-bit values
580 std $t0,`$FRAME+0`($sp)
581 std $t1,`$FRAME+8`($sp)
582 std $t2,`$FRAME+16`($sp)
583 std $t3,`$FRAME+24`($sp)
584 ; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
589 std $t4,`$FRAME+32`($sp)
590 std $t5,`$FRAME+40`($sp)
591 std $t6,`$FRAME+48`($sp)
592 std $t7,`$FRAME+56`($sp)
594 lfd $A0,8($nap_d) ; load a[j] in double format
596 lfd $N0,24($nap_d) ; load n[j] in double format
598 lfd $A2,40($nap_d) ; load a[j+1] in double format
600 lfd $N2,56($nap_d) ; load n[j+1] in double format
603 lfd $ba,`$FRAME+0`($sp)
604 lfd $bb,`$FRAME+8`($sp)
605 lfd $bc,`$FRAME+16`($sp)
606 lfd $bd,`$FRAME+24`($sp)
607 lfd $na,`$FRAME+32`($sp)
608 lfd $nb,`$FRAME+40`($sp)
609 lfd $nc,`$FRAME+48`($sp)
610 lfd $nd,`$FRAME+56`($sp)
630 fmadd $T1a,$A0,$bc,$T1a
631 fmadd $T1b,$A0,$bd,$T1b
632 fmadd $T2a,$A1,$bc,$T2a
633 fmadd $T2b,$A1,$bd,$T2b
634 fmadd $T3a,$A2,$bc,$T3a
635 fmadd $T3b,$A2,$bd,$T3b
639 fmadd $T1a,$N1,$na,$T1a
640 fmadd $T1b,$N1,$nb,$T1b
641 fmadd $T2a,$N2,$na,$T2a
642 fmadd $T2b,$N2,$nb,$T2b
643 fmadd $T3a,$N3,$na,$T3a
644 fmadd $T3b,$N3,$nb,$T3b
645 fmadd $T0a,$N0,$na,$T0a
646 fmadd $T0b,$N0,$nb,$T0b
648 fmadd $T1a,$N0,$nc,$T1a
649 fmadd $T1b,$N0,$nd,$T1b
650 fmadd $T2a,$N1,$nc,$T2a
651 fmadd $T2b,$N1,$nd,$T2b
652 fmadd $T3a,$N2,$nc,$T3a
653 fmadd $T3b,$N2,$nd,$T3b
654 fmadd $dota,$N3,$nc,$dota
655 fmadd $dotb,$N3,$nd,$dotb
666 stfd $T0a,`$FRAME+0`($sp)
667 stfd $T0b,`$FRAME+8`($sp)
668 stfd $T1a,`$FRAME+16`($sp)
669 stfd $T1b,`$FRAME+24`($sp)
670 stfd $T2a,`$FRAME+32`($sp)
671 stfd $T2b,`$FRAME+40`($sp)
672 stfd $T3a,`$FRAME+48`($sp)
673 stfd $T3b,`$FRAME+56`($sp)
677 lfd $A0,8($nap_d) ; load a[j] in double format
679 lfd $N0,24($nap_d) ; load n[j] in double format
681 lfd $A2,40($nap_d) ; load a[j+1] in double format
683 lfd $N2,56($nap_d) ; load n[j+1] in double format
686 ld $t0,`$FRAME+0`($sp)
687 ld $t1,`$FRAME+8`($sp)
688 ld $t2,`$FRAME+16`($sp)
689 ld $t3,`$FRAME+24`($sp)
693 ld $t4,`$FRAME+32`($sp)
694 ld $t5,`$FRAME+40`($sp)
697 add $t0,$t0,$carry ; can not overflow
700 ld $t6,`$FRAME+48`($sp)
701 ld $t7,`$FRAME+56`($sp)
702 fmadd $T0a,$A0,$ba,$dota
703 fmadd $T0b,$A0,$bb,$dotb
706 fmadd $T1a,$A0,$bc,$T1a
707 fmadd $T1b,$A0,$bd,$T1b
708 fmadd $T2a,$A1,$bc,$T2a
709 fmadd $T2b,$A1,$bd,$T2b
712 fmadd $T3a,$A2,$bc,$T3a
713 fmadd $T3b,$A2,$bd,$T3b
717 ldu $t1,8($tp) ; tp[j]
720 fmadd $T1a,$N1,$na,$T1a
721 fmadd $T1b,$N1,$nb,$T1b
722 fmadd $T2a,$N2,$na,$T2a
723 fmadd $T2b,$N2,$nb,$T2b
725 ldu $t2,8($tp) ; tp[j+1]
726 fmadd $T3a,$N3,$na,$T3a
727 fmadd $T3b,$N3,$nb,$T3b
728 fmadd $T0a,$N0,$na,$T0a
729 fmadd $T0b,$N0,$nb,$T0b
731 insrdi $t0,$t3,16,0 ; 0..63 bits
733 fmadd $T1a,$N0,$nc,$T1a
734 fmadd $T1b,$N0,$nd,$T1b
735 fmadd $T2a,$N1,$nc,$T2a
736 fmadd $T2b,$N1,$nd,$T2b
738 fmadd $T3a,$N2,$nc,$T3a
739 fmadd $T3b,$N2,$nd,$T3b
740 fmadd $dota,$N3,$nc,$dota
741 fmadd $dotb,$N3,$nd,$dotb
757 stfd $T0a,`$FRAME+0`($sp)
758 stfd $T0b,`$FRAME+8`($sp)
761 stfd $T1a,`$FRAME+16`($sp)
762 stfd $T1b,`$FRAME+24`($sp)
763 insrdi $t4,$t7,16,0 ; 64..127 bits
764 srdi $carry,$t7,16 ; upper 33 bits
765 stfd $T2a,`$FRAME+32`($sp)
766 stfd $T2b,`$FRAME+40`($sp)
768 stfd $T3a,`$FRAME+48`($sp)
769 stfd $T3b,`$FRAME+56`($sp)
771 std $t3,-16($tp) ; tp[j-1]
772 std $t5,-8($tp) ; tp[j]
777 ld $t0,`$FRAME+0`($sp)
778 ld $t1,`$FRAME+8`($sp)
779 ld $t2,`$FRAME+16`($sp)
780 ld $t3,`$FRAME+24`($sp)
781 ld $t4,`$FRAME+32`($sp)
782 ld $t5,`$FRAME+40`($sp)
783 ld $t6,`$FRAME+48`($sp)
784 ld $t7,`$FRAME+56`($sp)
785 stfd $dota,`$FRAME+64`($sp)
786 stfd $dotb,`$FRAME+72`($sp)
788 add $t0,$t0,$carry ; can not overflow
794 ld $t1,8($tp) ; tp[j]
798 ldu $t2,16($tp) ; tp[j+1]
800 insrdi $t0,$t3,16,0 ; 0..63 bits
810 insrdi $t4,$t7,16,0 ; 64..127 bits
811 srdi $carry,$t7,16 ; upper 33 bits
812 ld $t6,`$FRAME+64`($sp)
813 ld $t7,`$FRAME+72`($sp)
819 std $t3,-16($tp) ; tp[j-1]
820 std $t5,-8($tp) ; tp[j]
822 add $carry,$carry,$ovf ; consume upmost overflow
823 add $t6,$t6,$carry ; can not overflow
828 std $t6,0($tp) ; tp[num-1]
832 subf $nap_d,$t7,$nap_d ; rewind pointer
836 subf $np,$num,$np ; rewind np
837 addi $j,$j,1 ; restore counter
838 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
839 addi $tp,$sp,`$FRAME+$TRANSFER+8`
840 addi $t4,$sp,`$FRAME+$TRANSFER+16`
850 subfe $t0,$t1,$t0 ; tp[j]-np[j]
851 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
858 subfe $ovf,$i,$ovf ; handle upmost overflow bit
861 or $ap,$ap,$np ; ap=borrow?tp:rp
866 Lcopy: ; copy or in-place refresh
869 std $i,8($nap_d) ; zap nap_d
879 stdx $i,$tp,$i ; zap tp at once
884 $POP r14,`2*$SIZE_T`($sp)
885 $POP r15,`3*$SIZE_T`($sp)
886 $POP r16,`4*$SIZE_T`($sp)
887 $POP r17,`5*$SIZE_T`($sp)
888 $POP r18,`6*$SIZE_T`($sp)
889 $POP r19,`7*$SIZE_T`($sp)
890 $POP r20,`8*$SIZE_T`($sp)
891 $POP r21,`9*$SIZE_T`($sp)
892 $POP r22,`10*$SIZE_T`($sp)
893 $POP r23,`11*$SIZE_T`($sp)
894 lfd f14,`12*$SIZE_T+0`($sp)
895 lfd f15,`12*$SIZE_T+8`($sp)
896 lfd f16,`12*$SIZE_T+16`($sp)
897 lfd f17,`12*$SIZE_T+24`($sp)
898 lfd f18,`12*$SIZE_T+32`($sp)
899 lfd f19,`12*$SIZE_T+40`($sp)
900 lfd f20,`12*$SIZE_T+48`($sp)
901 lfd f21,`12*$SIZE_T+56`($sp)
902 lfd f22,`12*$SIZE_T+64`($sp)
903 lfd f23,`12*$SIZE_T+72`($sp)
904 lfd f24,`12*$SIZE_T+80`($sp)
905 lfd f25,`12*$SIZE_T+88`($sp)
907 li r3,1 ; signal "handled"
910 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
913 $code =~ s/\`([^\`]*)\`/eval $1/gem;