3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
# Select the build flavour from the requested output file name.  Each branch
# sets $FRAME (16 $SIZE_T-sized slots plus twelve 8-byte slots), the exported
# symbol name and the "store indexed with update" mnemonic.
# NOTE(review): the embedded line numbers skip values here, so further
# per-flavour settings presumably exist that this excerpt does not show.
14 if ($output =~ /32\-mont\.s/) {
17 $FRAME= $SIZE_T*16+8*12;
18 $fname= "bn_mul_mont_ppc64";
20 $STUX= "stwux"; # store indexed and update
# The 32-bit flavour is rejected outright; generation stops here.
23 die "not implemented yet";
24 } elsif ($output =~ /64\-mont\.s/) {
27 $FRAME= $SIZE_T*16+8*12;
28 $fname= "bn_mul_mont";
30 # same as above, but 64-bit mnemonics...
31 $STUX= "stdux"; # store indexed and update
# Any other output name is treated as a usage error.
34 } else { die "nonsense $output"; }
# If the next argument taken by shift is defined, STDOUT is left alone;
# otherwise everything printed to STDOUT is piped through ppc-xlate.pl (run
# with the current perl, $^X) with $output as its argument.  A failed open
# aborts generation.
36 ( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
37 die "can't call ../perlasm/ppc-xlate.pl: $!";
49 $rp="r9"; # $rp is reassigned
53 # non-volatile registers
69 # PPC offers enough register bank capacity to unroll inner loops twice
120 # sp----------->+-------------------------------+
122 # +-------------------------------+
124 # +-------------------------------+
125 # | 14 saved gpr, r14-r27 |
128 # +16*size_t +-------------------------------+
129 # | 12 saved fpr, f14-f25 |
132 # +12*8 +-------------------------------+
133 # | 8 gpr<->fpr transfer zone |
136 # +8*8 +-------------------------------+
137 # | __int64 tmp[-1] |
138 # +-------------------------------+
139 # | __int64 tmp[num] |
143 # +(num+1)*8 +-------------------------------+
144 # | double a_lo[num] |
148 # +num*8 +-------------------------------+
149 # | double a_hi[num] |
153 # +num*8 +-------------------------------+
154 # | double n_lo[num] |
158 # +num*8 +-------------------------------+
159 # | double n_hi[num] |
163 # +-------------------------------+
; Function entry: keep the result pointer, preset the return value and size
; the variable-length scratch frame.  The embedded line numbers skip values
; here, so the branches that act on the parity test and the instruction
; that loads the mask into $i are not part of this excerpt.
173 mr $rp,r3 ; $rp is reassigned
174 li r3,0 ; possible "not handled" return code
176 andi. r0,$num,1 ; $num has to be even
; From here on $num is a byte count (8 bytes per element).
179 slwi $num,$num,3 ; num*=8
181 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
182 add $tp,$tp,$num ; place for tp[num+1]
; Add the fixed part of the frame: $FRAME, $TRANSFER, one extra 8-byte
; slot and $RZONE.
183 addi $tp,$tp,`$FRAME+$TRANSFER+8+$RZONE`
184 subf $tp,$tp,$sp ; $sp-$tp
; NOTE(review): $i is expected to hold an alignment mask at this point;
; the instruction that sets it is not visible here.
185 and $tp,$tp,$i ; minimize TLB usage
186 subf $tp,$sp,$tp ; $tp-$sp
; $STUX stores the old $sp at the new frame address and moves $sp there
; in the same instruction.
187 $STUX $sp,$sp,$tp ; alloca
; Save registers r14-r27 into frame slots 2..15 (in units of $SIZE_T) and
; f14-f25 into the twelve 8-byte slots that start at 16*$SIZE_T.
; $PUSH is defined outside this excerpt.
189 $PUSH r14,`2*$SIZE_T`($sp)
190 $PUSH r15,`3*$SIZE_T`($sp)
191 $PUSH r16,`4*$SIZE_T`($sp)
192 $PUSH r17,`5*$SIZE_T`($sp)
193 $PUSH r18,`6*$SIZE_T`($sp)
194 $PUSH r19,`7*$SIZE_T`($sp)
195 $PUSH r20,`8*$SIZE_T`($sp)
196 $PUSH r21,`9*$SIZE_T`($sp)
197 $PUSH r22,`10*$SIZE_T`($sp)
198 $PUSH r23,`11*$SIZE_T`($sp)
199 $PUSH r24,`12*$SIZE_T`($sp)
200 $PUSH r25,`13*$SIZE_T`($sp)
201 $PUSH r26,`14*$SIZE_T`($sp)
202 $PUSH r27,`15*$SIZE_T`($sp)
203 stfd f14,`16*$SIZE_T+0`($sp)
204 stfd f15,`16*$SIZE_T+8`($sp)
205 stfd f16,`16*$SIZE_T+16`($sp)
206 stfd f17,`16*$SIZE_T+24`($sp)
207 stfd f18,`16*$SIZE_T+32`($sp)
208 stfd f19,`16*$SIZE_T+40`($sp)
209 stfd f20,`16*$SIZE_T+48`($sp)
210 stfd f21,`16*$SIZE_T+56`($sp)
211 stfd f22,`16*$SIZE_T+64`($sp)
212 stfd f23,`16*$SIZE_T+72`($sp)
213 stfd f24,`16*$SIZE_T+80`($sp)
214 stfd f25,`16*$SIZE_T+88`($sp)
; Zero both FP accumulators: write r0 to the slot at $FRAME and load that
; slot back into $dota and $dotb.
215 std r0,$FRAME($sp) ; r0 is still 0
216 lfd $dota,$FRAME($sp)
217 lfd $dotb,$FRAME($sp)
; Set-up for the first pass: fetch ap[0], the n0 value and bp[0], form
; (ap[0]*bp[0])*n0, and move both bp[0] and that product to FP registers.
; The embedded line numbers skip values, so the instructions that split
; the 64-bit values into $t0-$t3 / $t4-$t7 and any integer-to-double
; conversion are not part of this excerpt.
219 addi $tp,$sp,`$FRAME+$TRANSFER`
220 ; note that {an}p_{lh} are off by 1, this is because they
221 ; are used with stfdu/lfdu instruction...
227 ld $a0,0($ap) ; pull ap[0] value
; The pointer held in $n0 is replaced by the value it points to.
228 ld $n0,0($n0) ; pull n0[0] value
; $num is already a byte count (num*8), hence the shift by 3+1.
229 srwi $j,$num,`3+1` ; counter register, num/2
231 ld $t3,0($bp) ; bp[0]
; mulld keeps the low 64 bits of each product.
232 mulld $t7,$a0,$t3 ; ap[0]*bp[0]
233 mulld $t7,$t7,$n0 ; tp[0]*n0
235 ; transfer bp[0] to FPU as 4x16-bit values
; GPR-to-FPR moves go through memory: store to the slots at $FRAME+0..24,
; then load the same slots into FP registers.
240 std $t0,`$FRAME+0`($sp)
241 std $t1,`$FRAME+8`($sp)
242 std $t2,`$FRAME+16`($sp)
243 std $t3,`$FRAME+24`($sp)
244 lfd $ba,`$FRAME+0`($sp)
245 lfd $bb,`$FRAME+8`($sp)
246 lfd $bc,`$FRAME+16`($sp)
247 lfd $bd,`$FRAME+24`($sp)
253 ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
; Same store/load bounce, using the slots at $FRAME+32..56.
258 std $t4,`$FRAME+32`($sp)
259 std $t5,`$FRAME+40`($sp)
260 std $t6,`$FRAME+48`($sp)
261 std $t7,`$FRAME+56`($sp)
262 lfd $na,`$FRAME+32`($sp)
263 lfd $nb,`$FRAME+40`($sp)
264 lfd $nc,`$FRAME+48`($sp)
265 lfd $nd,`$FRAME+56`($sp)
; Read the first two elements of a[] and n[] as 32-bit words, move them to
; FP registers through the slots at $FRAME+0..56 and save them through
; $ap_l / $np_l.
; The embedded line numbers skip values: the loads of $t1/$t3/$t5/$t7,
; any integer-to-double conversion and the stores of the remaining FP
; registers are not part of this excerpt.
; $tp starts 8 bytes below $FRAME+$TRANSFER.
272 addi $tp,$sp,`$FRAME+$TRANSFER-8`
276 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
278 lwz $t2,4($np) ; load n[j] as 32-bit word pair
280 std $t0,`$FRAME+0`($sp)
281 std $t1,`$FRAME+8`($sp)
282 std $t2,`$FRAME+16`($sp)
283 std $t3,`$FRAME+24`($sp)
284 lfd $A0,`$FRAME+0`($sp)
285 lfd $A1,`$FRAME+8`($sp)
286 lfd $N0,`$FRAME+16`($sp)
287 lfd $N1,`$FRAME+24`($sp)
; stfdu stores at pointer+8 and leaves the pointer advanced by 8.
292 stfdu $A0,8($ap_l) ; save a[j] in double format
294 stfdu $N0,8($np_l) ; save n[j] in double format
297 lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair
299 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
301 std $t4,`$FRAME+32`($sp)
302 std $t5,`$FRAME+40`($sp)
303 std $t6,`$FRAME+48`($sp)
304 std $t7,`$FRAME+56`($sp)
305 lfd $A2,`$FRAME+32`($sp)
306 lfd $A3,`$FRAME+40`($sp)
307 lfd $N2,`$FRAME+48`($sp)
308 lfd $N3,`$FRAME+56`($sp)
313 stfdu $A2,8($ap_l) ; save a[j+1] in double format
315 stfdu $N2,8($np_l) ; save n[j+1] in double format
; Multiply-accumulate step followed by recombination into two 64-bit words.
; Operand order of fmadd is (dst, x, y, addend): dst = x*y + addend.
; The embedded line numbers skip values: $T1a..$T3b are used as addends
; below without a visible initialisation, and the conversion of the FP
; results to integer format plus most of the carry propagation are on
; lines not shown.
; A-side products; the first two take the running $dota/$dotb as addend.
320 fmadd $T0a,$A0,$ba,$dota
321 fmadd $T0b,$A0,$bb,$dotb
329 fmadd $T1a,$A0,$bc,$T1a
330 fmadd $T1b,$A0,$bd,$T1b
331 fmadd $T2a,$A1,$bc,$T2a
332 fmadd $T2b,$A1,$bd,$T2b
333 fmadd $T3a,$A2,$bc,$T3a
334 fmadd $T3b,$A2,$bd,$T3b
; N-side products are added into the same eight accumulators.
338 fmadd $T0a,$N0,$na,$T0a
339 fmadd $T0b,$N0,$nb,$T0b
340 fmadd $T1a,$N1,$na,$T1a
341 fmadd $T1b,$N1,$nb,$T1b
342 fmadd $T2a,$N2,$na,$T2a
343 fmadd $T2b,$N2,$nb,$T2b
344 fmadd $T3a,$N3,$na,$T3a
345 fmadd $T3b,$N3,$nb,$T3b
347 fmadd $T1a,$N0,$nc,$T1a
348 fmadd $T1b,$N0,$nd,$T1b
349 fmadd $T2a,$N1,$nc,$T2a
350 fmadd $T2b,$N1,$nd,$T2b
351 fmadd $T3a,$N2,$nc,$T3a
352 fmadd $T3b,$N2,$nd,$T3b
; $dota/$dotb are not stored below; they stay in registers.
353 fmadd $dota,$N3,$nc,$dota
354 fmadd $dotb,$N3,$nd,$dotb
; FPR-to-GPR moves go through memory: store to the slots at $FRAME+0..56,
; then load the same slots into $t0-$t7.
365 stfd $T0a,`$FRAME+0`($sp)
366 stfd $T0b,`$FRAME+8`($sp)
367 stfd $T1a,`$FRAME+16`($sp)
368 stfd $T1b,`$FRAME+24`($sp)
369 stfd $T2a,`$FRAME+32`($sp)
370 stfd $T2b,`$FRAME+40`($sp)
371 stfd $T3a,`$FRAME+48`($sp)
372 stfd $T3b,`$FRAME+56`($sp)
375 ld $t0,`$FRAME+0`($sp)
376 ld $t1,`$FRAME+8`($sp)
377 ld $t2,`$FRAME+16`($sp)
378 ld $t3,`$FRAME+24`($sp)
379 ld $t4,`$FRAME+32`($sp)
380 ld $t5,`$FRAME+40`($sp)
381 ld $t6,`$FRAME+48`($sp)
382 ld $t7,`$FRAME+56`($sp)
384 add $t0,$t0,$carry ; can not overflow
; insrdi x,y,16,0 places the low 16 bits of y into the top 16 bits of x.
402 insrdi $t0,$t3,16,0 ; 0..63 bits
405 insrdi $t4,$t7,16,0 ; 64..127 bits
; What remains of $t7 above its low 16 bits becomes the next carry.
406 srdi $carry,$t7,16 ; upper 33 bits
408 std $t0,8($tp) ; tp[j-1]
; stdu stores at $tp+16 and leaves $tp advanced by 16.
409 stdu $t4,16($tp) ; tp[j]
; Conversion of the next a[j]/n[j] pair followed by a multiply-accumulate
; and recombination step; the visible instructions are the same sequence
; as the two stretches that precede this one in the file.
; NOTE(review): presumably the repeating body of the first pass; the
; label, the pointer increments and the loop branch are on lines this
; excerpt does not show. Confirm against the full file.
411 lwz $t0,4($ap) ; load a[j] as 32-bit word pair
413 lwz $t2,4($np) ; load n[j] as 32-bit word pair
; GPR-to-FPR moves go through the slots at $FRAME+0..24.
415 std $t0,`$FRAME+0`($sp)
416 std $t1,`$FRAME+8`($sp)
417 std $t2,`$FRAME+16`($sp)
418 std $t3,`$FRAME+24`($sp)
419 lfd $A0,`$FRAME+0`($sp)
420 lfd $A1,`$FRAME+8`($sp)
421 lfd $N0,`$FRAME+16`($sp)
422 lfd $N1,`$FRAME+24`($sp)
427 stfdu $A0,8($ap_l) ; save a[j] in double format
429 stfdu $N0,8($np_l) ; save n[j] in double format
432 lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair
434 lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
436 std $t4,`$FRAME+32`($sp)
437 std $t5,`$FRAME+40`($sp)
438 std $t6,`$FRAME+48`($sp)
439 std $t7,`$FRAME+56`($sp)
440 lfd $A2,`$FRAME+32`($sp)
441 lfd $A3,`$FRAME+40`($sp)
442 lfd $N2,`$FRAME+48`($sp)
443 lfd $N3,`$FRAME+56`($sp)
448 stfdu $A2,8($ap_l) ; save a[j+1] in double format
450 stfdu $N2,8($np_l) ; save n[j+1] in double format
; fmadd dst,x,y,addend computes dst = x*y + addend.  The first two take
; the $dota/$dotb left by the previous step as addend.
455 fmadd $T0a,$A0,$ba,$dota
456 fmadd $T0b,$A0,$bb,$dotb
464 fmadd $T1a,$A0,$bc,$T1a
465 fmadd $T1b,$A0,$bd,$T1b
466 fmadd $T2a,$A1,$bc,$T2a
467 fmadd $T2b,$A1,$bd,$T2b
468 fmadd $T3a,$A2,$bc,$T3a
469 fmadd $T3b,$A2,$bd,$T3b
; N-side products are added into the same accumulators.
473 fmadd $T0a,$N0,$na,$T0a
474 fmadd $T0b,$N0,$nb,$T0b
475 fmadd $T1a,$N1,$na,$T1a
476 fmadd $T1b,$N1,$nb,$T1b
477 fmadd $T2a,$N2,$na,$T2a
478 fmadd $T2b,$N2,$nb,$T2b
479 fmadd $T3a,$N3,$na,$T3a
480 fmadd $T3b,$N3,$nb,$T3b
482 fmadd $T1a,$N0,$nc,$T1a
483 fmadd $T1b,$N0,$nd,$T1b
484 fmadd $T2a,$N1,$nc,$T2a
485 fmadd $T2b,$N1,$nd,$T2b
486 fmadd $T3a,$N2,$nc,$T3a
487 fmadd $T3b,$N2,$nd,$T3b
488 fmadd $dota,$N3,$nc,$dota
489 fmadd $dotb,$N3,$nd,$dotb
; FPR-to-GPR moves go through the slots at $FRAME+0..56.
500 stfd $T0a,`$FRAME+0`($sp)
501 stfd $T0b,`$FRAME+8`($sp)
502 stfd $T1a,`$FRAME+16`($sp)
503 stfd $T1b,`$FRAME+24`($sp)
504 stfd $T2a,`$FRAME+32`($sp)
505 stfd $T2b,`$FRAME+40`($sp)
506 stfd $T3a,`$FRAME+48`($sp)
507 stfd $T3b,`$FRAME+56`($sp)
510 ld $t0,`$FRAME+0`($sp)
511 ld $t1,`$FRAME+8`($sp)
512 ld $t2,`$FRAME+16`($sp)
513 ld $t3,`$FRAME+24`($sp)
514 ld $t4,`$FRAME+32`($sp)
515 ld $t5,`$FRAME+40`($sp)
516 ld $t6,`$FRAME+48`($sp)
517 ld $t7,`$FRAME+56`($sp)
519 add $t0,$t0,$carry ; can not overflow
; insrdi x,y,16,0 places the low 16 bits of y into the top 16 bits of x.
537 insrdi $t0,$t3,16,0 ; 0..63 bits
540 insrdi $t4,$t7,16,0 ; 64..127 bits
541 srdi $carry,$t7,16 ; upper 33 bits
543 std $t0,8($tp) ; tp[j-1]
; stdu stores at $tp+16 and leaves $tp advanced by 16.
544 stdu $t4,16($tp) ; tp[j]
; End of the first pass: move what is left in $dota/$dotb to GPRs through
; the slots at $FRAME+0/+8, fold in $carry and store the last word of tp.
; The embedded line numbers skip values, so the FP-to-integer conversion
; and the use made of $t1 are on lines not shown.
548 stfd $dota,`$FRAME+0`($sp)
549 stfd $dotb,`$FRAME+8`($sp)
550 ld $t0,`$FRAME+0`($sp)
551 ld $t1,`$FRAME+8`($sp)
552 add $t0,$t0,$carry ; can not overflow
557 std $t0,8($tp) ; tp[num-1]
; $num is a byte count, so each pointer moves back by the num*8 bytes it
; advanced while the doubles were being stored.
559 subf $ap_l,$num,$ap_l ; rewind pointers
560 subf $ap_h,$num,$ap_h
561 subf $np_l,$num,$np_l
562 subf $np_h,$num,$np_h
; Per-bp[i] set-up: load bp[i] and tp[0], form (ap[0]*bp[i]+tp[0])*n0,
; move both to FP registers, reset $tp and clear the FP accumulators.
; The embedded line numbers skip values: the split of the 64-bit values
; into $t0-$t3 / $t4-$t7 and any integer-to-double conversion are not
; part of this excerpt.
; $i is used here as a byte offset into bp[].
567 ldx $t3,$bp,$i ; bp[i]
568 ld $t0,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
569 mulld $t7,$a0,$t3 ; ap[0]*bp[i]
570 add $t7,$t7,$t0 ; ap[0]*bp[i]+tp[0]
571 mulld $t7,$t7,$n0 ; tp[0]*n0
573 ; transfer b[i] to FPU as 4x16-bit values
; GPR-to-FPR moves go through the slots at $FRAME+0..24.
578 std $t0,`$FRAME+0`($sp)
579 std $t1,`$FRAME+8`($sp)
580 std $t2,`$FRAME+16`($sp)
581 std $t3,`$FRAME+24`($sp)
582 lfd $ba,`$FRAME+0`($sp)
583 lfd $bb,`$FRAME+8`($sp)
584 lfd $bc,`$FRAME+16`($sp)
585 lfd $bd,`$FRAME+24`($sp)
591 ; transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
596 std $t4,`$FRAME+32`($sp)
597 std $t5,`$FRAME+40`($sp)
598 std $t6,`$FRAME+48`($sp)
599 std $t7,`$FRAME+56`($sp)
600 lfd $na,`$FRAME+32`($sp)
601 lfd $nb,`$FRAME+40`($sp)
602 lfd $nc,`$FRAME+48`($sp)
603 lfd $nd,`$FRAME+56`($sp)
; $tp goes back to $FRAME+$TRANSFER, 8 bytes below the tp[0] slot read above.
609 addi $tp,$sp,`$FRAME+$TRANSFER`
; Subtracting each accumulator from itself resets it for this pass.
610 fsub $dota,$dota,$dota
611 fsub $dotb,$dotb,$dotb
; Multiply-accumulate step that reads a[]/n[] back in double format and
; updates two words of tp in place.
; The embedded line numbers skip values: the loads of $A1/$A3/$N1/$N3,
; the initialisation of $T1a..$T3b, the FP-to-integer conversion and the
; additions that produce the stored $t3/$t5 are on lines not shown.
; lfdu loads from pointer+8 and leaves the pointer advanced by 8.
615 lfdu $A0,8($ap_l) ; load a[j] in double format
617 lfdu $N0,8($np_l) ; load n[j] in double format
619 lfdu $A2,8($ap_l) ; load a[j+1] in double format
621 lfdu $N2,8($np_l) ; load n[j+1] in double format
; fmadd dst,x,y,addend computes dst = x*y + addend.
624 fmadd $T0a,$A0,$ba,$dota
625 fmadd $T0b,$A0,$bb,$dotb
633 fmadd $T1a,$A0,$bc,$T1a
634 fmadd $T1b,$A0,$bd,$T1b
635 fmadd $T2a,$A1,$bc,$T2a
636 fmadd $T2b,$A1,$bd,$T2b
637 fmadd $T3a,$A2,$bc,$T3a
638 fmadd $T3b,$A2,$bd,$T3b
; N-side products are added into the same accumulators.
642 fmadd $T0a,$N0,$na,$T0a
643 fmadd $T0b,$N0,$nb,$T0b
644 fmadd $T1a,$N1,$na,$T1a
645 fmadd $T1b,$N1,$nb,$T1b
646 fmadd $T2a,$N2,$na,$T2a
647 fmadd $T2b,$N2,$nb,$T2b
648 fmadd $T3a,$N3,$na,$T3a
649 fmadd $T3b,$N3,$nb,$T3b
651 fmadd $T1a,$N0,$nc,$T1a
652 fmadd $T1b,$N0,$nd,$T1b
653 fmadd $T2a,$N1,$nc,$T2a
654 fmadd $T2b,$N1,$nd,$T2b
655 fmadd $T3a,$N2,$nc,$T3a
656 fmadd $T3b,$N2,$nd,$T3b
657 fmadd $dota,$N3,$nc,$dota
658 fmadd $dotb,$N3,$nd,$dotb
; FPR-to-GPR moves go through the slots at $FRAME+0..56.
669 stfd $T0a,`$FRAME+0`($sp)
670 stfd $T0b,`$FRAME+8`($sp)
671 stfd $T1a,`$FRAME+16`($sp)
672 stfd $T1b,`$FRAME+24`($sp)
673 stfd $T2a,`$FRAME+32`($sp)
674 stfd $T2b,`$FRAME+40`($sp)
675 stfd $T3a,`$FRAME+48`($sp)
676 stfd $T3b,`$FRAME+56`($sp)
679 ld $t0,`$FRAME+0`($sp)
680 ld $t1,`$FRAME+8`($sp)
681 ld $t2,`$FRAME+16`($sp)
682 ld $t3,`$FRAME+24`($sp)
683 ld $t4,`$FRAME+32`($sp)
684 ld $t5,`$FRAME+40`($sp)
685 ld $t6,`$FRAME+48`($sp)
686 ld $t7,`$FRAME+56`($sp)
688 add $t0,$t0,$carry ; can not overflow
; insrdi x,y,16,0 places the low 16 bits of y into the top 16 bits of x.
706 insrdi $t0,$t3,16,0 ; 0..63 bits
709 insrdi $t4,$t7,16,0 ; 64..127 bits
710 srdi $carry,$t7,16 ; upper 33 bits
; Fetch the two existing tp words; ldu leaves $tp advanced by 16, which is
; why the stores below use negative offsets.
712 ld $t1,8($tp) ; tp[j]
713 ldu $t2,16($tp) ; tp[j+1]
719 std $t3,-16($tp) ; tp[j-1]
720 std $t5,-8($tp) ; tp[j]
; Same visible instruction sequence as the multiply-accumulate step that
; precedes this one in the file: reload a[]/n[] doubles, accumulate,
; recombine, and update two words of tp in place.
; NOTE(review): presumably the repeating body for the remaining element
; pairs; the label, the loop branch and several intermediate instructions
; are on lines this excerpt does not show. Confirm against the full file.
; lfdu loads from pointer+8 and leaves the pointer advanced by 8.
722 lfdu $A0,8($ap_l) ; load a[j] in double format
724 lfdu $N0,8($np_l) ; load n[j] in double format
726 lfdu $A2,8($ap_l) ; load a[j+1] in double format
728 lfdu $N2,8($np_l) ; load n[j+1] in double format
; fmadd dst,x,y,addend computes dst = x*y + addend.
731 fmadd $T0a,$A0,$ba,$dota
732 fmadd $T0b,$A0,$bb,$dotb
740 fmadd $T1a,$A0,$bc,$T1a
741 fmadd $T1b,$A0,$bd,$T1b
742 fmadd $T2a,$A1,$bc,$T2a
743 fmadd $T2b,$A1,$bd,$T2b
744 fmadd $T3a,$A2,$bc,$T3a
745 fmadd $T3b,$A2,$bd,$T3b
; N-side products are added into the same accumulators.
749 fmadd $T0a,$N0,$na,$T0a
750 fmadd $T0b,$N0,$nb,$T0b
751 fmadd $T1a,$N1,$na,$T1a
752 fmadd $T1b,$N1,$nb,$T1b
753 fmadd $T2a,$N2,$na,$T2a
754 fmadd $T2b,$N2,$nb,$T2b
755 fmadd $T3a,$N3,$na,$T3a
756 fmadd $T3b,$N3,$nb,$T3b
758 fmadd $T1a,$N0,$nc,$T1a
759 fmadd $T1b,$N0,$nd,$T1b
760 fmadd $T2a,$N1,$nc,$T2a
761 fmadd $T2b,$N1,$nd,$T2b
762 fmadd $T3a,$N2,$nc,$T3a
763 fmadd $T3b,$N2,$nd,$T3b
764 fmadd $dota,$N3,$nc,$dota
765 fmadd $dotb,$N3,$nd,$dotb
; FPR-to-GPR moves go through the slots at $FRAME+0..56.
776 stfd $T0a,`$FRAME+0`($sp)
777 stfd $T0b,`$FRAME+8`($sp)
778 stfd $T1a,`$FRAME+16`($sp)
779 stfd $T1b,`$FRAME+24`($sp)
780 stfd $T2a,`$FRAME+32`($sp)
781 stfd $T2b,`$FRAME+40`($sp)
782 stfd $T3a,`$FRAME+48`($sp)
783 stfd $T3b,`$FRAME+56`($sp)
786 ld $t0,`$FRAME+0`($sp)
787 ld $t1,`$FRAME+8`($sp)
788 ld $t2,`$FRAME+16`($sp)
789 ld $t3,`$FRAME+24`($sp)
790 ld $t4,`$FRAME+32`($sp)
791 ld $t5,`$FRAME+40`($sp)
792 ld $t6,`$FRAME+48`($sp)
793 ld $t7,`$FRAME+56`($sp)
795 add $t0,$t0,$carry ; can not overflow
; insrdi x,y,16,0 places the low 16 bits of y into the top 16 bits of x.
813 insrdi $t0,$t3,16,0 ; 0..63 bits
816 insrdi $t4,$t7,16,0 ; 64..127 bits
817 srdi $carry,$t7,16 ; upper 33 bits
; ldu leaves $tp advanced by 16, hence the negative store offsets below.
819 ld $t1,8($tp) ; tp[j]
820 ldu $t2,16($tp) ; tp[j+1]
826 std $t3,-16($tp) ; tp[j-1]
827 std $t5,-8($tp) ; tp[j]
; End of a pass over bp[i]: move what is left in $dota/$dotb to GPRs
; through the slots at $FRAME+0/+8, fold in $carry together with the
; overflow word $ovf, and store the last word of tp.
; The embedded line numbers skip values, so the FP-to-integer conversion,
; the use made of $t1 and the update of $ovf are on lines not shown.
831 stfd $dota,`$FRAME+0`($sp)
832 stfd $dotb,`$FRAME+8`($sp)
833 ld $t0,`$FRAME+0`($sp)
834 ld $t1,`$FRAME+8`($sp)
835 add $carry,$carry,$ovf ; consume upmost overflow
836 add $t0,$t0,$carry ; can not overflow
841 std $t0,0($tp) ; tp[num-1]
; $num is a byte count, so each pointer moves back by num*8 bytes.
843 subf $ap_l,$num,$ap_l ; rewind pointers
844 subf $ap_h,$num,$ap_h
845 subf $np_l,$num,$np_l
846 subf $np_h,$num,$np_h
; Final stage: multi-word subtraction tp - np with borrow, selection of
; the source for the result, then a copy that also overwrites the scratch
; areas.
; The embedded line numbers skip values: the loads, the stores of the
; difference, the loop labels/branches and the masking that precedes the
; `or` are on lines not shown.
851 subf $np,$num,$np ; rewind np
; Subtracting a register from itself with subfc yields 0 and sets CA,
; which the subfe instructions below treat as "no borrow".
852 subfc $i,$i,$i ; j=0 and "clear" XER[CA]
853 addi $tp,$sp,`$FRAME+$TRANSFER+8`
854 addi $t4,$sp,`$FRAME+$TRANSFER+16`
; subfe chains the borrow from one word to the next through CA.
865 subfe $t0,$t1,$t0 ; tp[j]-np[j]
866 subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1]
873 subfe $ovf,$i,$ovf ; handle upmost overflow bit
876 or $ap,$ap,$np ; ap=borrow?tp:rp
881 Lcopy: ; copy or in-place refresh
; Both stores write the current value of $i over scratch data.
884 stdu $i,8($ap_l) ; zap {an}p_{lh}
894 stdx $i,$tp,$i ; zap tp at once
; Epilogue: reload r14-r27 and f14-f25 from frame slots 2..15 (in units of
; $SIZE_T) and the twelve 8-byte slots at 16*$SIZE_T, then set the return
; value.  $POP is defined outside this excerpt; restoring $sp and the
; return instruction are on lines not shown.
899 $POP r14,`2*$SIZE_T`($sp)
900 $POP r15,`3*$SIZE_T`($sp)
901 $POP r16,`4*$SIZE_T`($sp)
902 $POP r17,`5*$SIZE_T`($sp)
903 $POP r18,`6*$SIZE_T`($sp)
904 $POP r19,`7*$SIZE_T`($sp)
905 $POP r20,`8*$SIZE_T`($sp)
906 $POP r21,`9*$SIZE_T`($sp)
907 $POP r22,`10*$SIZE_T`($sp)
908 $POP r23,`11*$SIZE_T`($sp)
909 $POP r24,`12*$SIZE_T`($sp)
910 $POP r25,`13*$SIZE_T`($sp)
911 $POP r26,`14*$SIZE_T`($sp)
912 $POP r27,`15*$SIZE_T`($sp)
913 lfd f14,`16*$SIZE_T+0`($sp)
914 lfd f15,`16*$SIZE_T+8`($sp)
915 lfd f16,`16*$SIZE_T+16`($sp)
916 lfd f17,`16*$SIZE_T+24`($sp)
917 lfd f18,`16*$SIZE_T+32`($sp)
918 lfd f19,`16*$SIZE_T+40`($sp)
919 lfd f20,`16*$SIZE_T+48`($sp)
920 lfd f21,`16*$SIZE_T+56`($sp)
921 lfd f22,`16*$SIZE_T+64`($sp)
922 lfd f23,`16*$SIZE_T+72`($sp)
923 lfd f24,`16*$SIZE_T+80`($sp)
924 lfd f25,`16*$SIZE_T+88`($sp)
; r3 was preset to 0 at entry; 1 reports that the call was carried out.
926 li r3,1 ; signal "handled"
; Identification string placed in the generated assembly.
929 .asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
# Post-process the text held in $code: every back-quoted expression is
# replaced by the result of evaluating it as Perl (/e), for all occurrences
# in the string (/g).  This is what turns offsets such as `$FRAME+8` into
# plain numbers.
932 $code =~ s/\`([^\`]*)\`/eval $1/gem;