From: Andy Polyakov Date: Sat, 29 Dec 2007 20:26:46 +0000 (+0000) Subject: New Montgomery multiplication module, ppc64-mont.pl. Reference, non-optimized X-Git-Tag: OpenSSL_0_9_8k^2~598 X-Git-Url: https://git.librecmc.org/?a=commitdiff_plain;h=64214a218369e4a811e0541de27ba618a31a61c2;p=oweals%2Fopenssl.git New Montgomery multiplication module, ppc64-mont.pl. Reference, non-optimized implementation. This is essentially an informational commit. --- diff --git a/crypto/bn/asm/ppc64-mont.pl b/crypto/bn/asm/ppc64-mont.pl new file mode 100644 index 0000000000..0c54378f74 --- /dev/null +++ b/crypto/bn/asm/ppc64-mont.pl @@ -0,0 +1,690 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# December 2007 + +$output = shift; + +if ($output =~ /32\-mont\.s/) { + $SIZE_T=4; + $RZONE= 224; + $FRAME= $SIZE_T*16+8*12; + $fname= "bn_mul_mont_ppc64"; + + $STUX= "stwux"; # store indexed and update + $PUSH= "stw"; + $POP= "lwz"; + die "not implemented yet"; +} elsif ($output =~ /64\-mont\.s/) { + $SIZE_T=8; + $RZONE= 288; + $FRAME= $SIZE_T*16+8*12; + $fname= "bn_mul_mont"; + + # same as above, but 64-bit mnemonics... 
+ $STUX= "stdux"; # store indexed and update + $PUSH= "std"; + $POP= "ld"; +} else { die "nonsense $output"; } + +( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) || + die "can't call ../perlasm/ppc-xlate.pl: $!"; + +$TRANSFER=8*8; + +$sp="r1"; +$toc="r2"; +$rp="r3"; $ovf="r3"; +$ap="r4"; +$bp="r5"; +$np="r6"; +$n0="r7"; +$num="r8"; +$rp="r9"; # $rp is reassigned +$tp="r10"; +$j="r11"; +$i="r12"; +# non-volatile registers +$ap_l="r14"; +$ap_h="r15"; +$np_l="r16"; +$np_h="r17"; +$carry="r18"; +$a0="r19"; # ap[0] +$t0="r20"; +$t1="r21"; +$t2="r22"; +$t3="r23"; +$t4="r24"; +$t5="r25"; +$t6="r26"; +$t7="r27"; + +# PPC offers enough register bank capacity to unroll inner loops twice +# +# ..A3A2A1A0 +# dcba +# ----------- +# A0a +# A0b +# A0c +# A0d +# A1a +# A1b +# A1c +# A1d +# A2a +# A2b +# A2c +# A2d +# A3a +# A3b +# A3c +# A3d +# ..a +# ..b +# +$ba="f0"; +$bb="f1"; +$bc="f2"; +$bd="f3"; +$na="f4"; +$nb="f5"; +$nc="f6"; +$nd="f7"; +$dota="f8"; +$dotb="f9"; +$A0="f10"; +$A1="f11"; +$A2="f12"; +$A3="f13"; +$N0="f14"; +$N1="f15"; +$N2="f16"; +$N3="f17"; +$T0a="f18"; +$T0b="f19"; +$T1a="f20"; +$T1b="f21"; +$T2a="f22"; +$T2b="f23"; +$T3a="f24"; +$T3b="f25"; + +# sp----------->+-------------------------------+ +# | saved sp | +# +-------------------------------+ +# | | +# +-------------------------------+ +# | 14 saved gpr, r14-r27 | +# . . +# . . +# +16*size_t +-------------------------------+ +# | 12 saved fpr, f14-f25 | +# . . +# . . +# +12*8 +-------------------------------+ +# | 8 gpr<->fpr transfer zone | +# . . +# . . +# +8*8 +-------------------------------+ +# | __int64 tmp[-1] | +# +-------------------------------+ +# | __int64 tmp[num] | +# . . +# . . +# . . +# +(num+1)*8 +-------------------------------+ +# | double a_lo[num] | +# . . +# . . +# . . +# +num*8 +-------------------------------+ +# | double a_hi[num] | +# . . +# . . +# . . +# +num*8 +-------------------------------+ +# | double n_lo[num] | +# . . +# . . +# . . 
+# +num*8 +-------------------------------+ +# | double n_hi[num] | +# . . +# . . +# . . +# +-------------------------------+ + +$code=<<___; +.machine "any" +.text + +.globl .$fname +.align 4 +.$fname: + cmpwi $num,4 + mr $rp,r3 ; $rp is reassigned + li r3,0 ; possible "not handled" return code + bltlr- + andi. r0,$num,1 ; $num has to be even + bnelr- + + slwi $num,$num,3 ; num*=8 + li $i,-4096 + slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num + add $tp,$tp,$num ; place for tp[num+1] + addi $tp,$tp,`$FRAME+$TRANSFER+8+$RZONE` + subf $tp,$tp,$sp ; $sp-$tp + and $tp,$tp,$i ; minimize TLB usage + subf $tp,$sp,$tp ; $tp-$sp + $STUX $sp,$sp,$tp ; alloca + + $PUSH r14,`2*$SIZE_T`($sp) + $PUSH r15,`3*$SIZE_T`($sp) + $PUSH r16,`4*$SIZE_T`($sp) + $PUSH r17,`5*$SIZE_T`($sp) + $PUSH r18,`6*$SIZE_T`($sp) + $PUSH r19,`7*$SIZE_T`($sp) + $PUSH r20,`8*$SIZE_T`($sp) + $PUSH r21,`9*$SIZE_T`($sp) + $PUSH r22,`10*$SIZE_T`($sp) + $PUSH r23,`11*$SIZE_T`($sp) + $PUSH r24,`12*$SIZE_T`($sp) + $PUSH r25,`13*$SIZE_T`($sp) + $PUSH r26,`14*$SIZE_T`($sp) + $PUSH r27,`15*$SIZE_T`($sp) + stfd f14,`16*$SIZE_T+0`($sp) + stfd f15,`16*$SIZE_T+8`($sp) + stfd f16,`16*$SIZE_T+16`($sp) + stfd f17,`16*$SIZE_T+24`($sp) + stfd f18,`16*$SIZE_T+32`($sp) + stfd f19,`16*$SIZE_T+40`($sp) + stfd f20,`16*$SIZE_T+48`($sp) + stfd f21,`16*$SIZE_T+56`($sp) + stfd f22,`16*$SIZE_T+64`($sp) + stfd f23,`16*$SIZE_T+72`($sp) + stfd f24,`16*$SIZE_T+80`($sp) + stfd f25,`16*$SIZE_T+88`($sp) + std r0,$FRAME($sp) ; r0 is still 0 + lfd $dota,$FRAME($sp) + lfd $dotb,$FRAME($sp) + + addi $tp,$sp,`$FRAME+$TRANSFER` + ; note that {an}p_{lh} are off by 1, this is because they + ; are used with stfdu/lfdu instruction... 
+ add $ap_l,$tp,$num + add $ap_h,$ap_l,$num + add $np_l,$ap_h,$num + add $np_h,$np_l,$num + + ld $a0,0($ap) ; pull ap[0] value + ld $n0,0($n0) ; pull n0[0] value + srwi $j,$num,`3+1` ; counter register, num/2 + + ld $t3,0($bp) ; bp[0] + mulld $t7,$a0,$t3 ; ap[0]*bp[0] + mulld $t7,$t7,$n0 ; tp[0]*n0 + + ; transfer bp[0] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + + ; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + addi $tp,$sp,`$FRAME+$TRANSFER-8` + li $carry,0 + mtctr $j +.align 4 +L1st: + lwz $t0,4($ap) ; load a[j] as 32-bit word pair + lwz $t1,0($ap) + lwz $t2,4($np) ; load n[j] as 32-bit word pair + lwz $t3,0($np) + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + lfd $A0,`$FRAME+0`($sp) + lfd $A1,`$FRAME+8`($sp) + lfd $N0,`$FRAME+16`($sp) + lfd $N1,`$FRAME+24`($sp) + fcfid $A0,$A0 + fcfid $A1,$A1 + fcfid $N0,$N0 + fcfid $N1,$N1 + stfdu $A0,8($ap_l) ; save a[j] in double format + stfdu $A1,8($ap_h) + stfdu $N0,8($np_l) ; save n[j] in double format + stfdu $N1,8($np_h) + + lwz $t4,12($ap) ; load a[j+1] as 32-bit word pair + lwz $t5,8($ap) + lwz $t6,12($np) ; load n[j+1] as 32-bit word pair + lwz $t7,8($np) + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std 
$t7,`$FRAME+56`($sp) + lfd $A2,`$FRAME+32`($sp) + lfd $A3,`$FRAME+40`($sp) + lfd $N2,`$FRAME+48`($sp) + lfd $N3,`$FRAME+56`($sp) + fcfid $A2,$A2 + fcfid $A3,$A3 + fcfid $N2,$N2 + fcfid $N3,$N3 + stfdu $A2,8($ap_l) ; save a[j+1] in double format + stfdu $A3,8($ap_h) + stfdu $N2,8($np_l) ; save n[j+1] in double format + stfdu $N3,8($np_h) + addi $ap,$ap,16 + addi $np,$np,16 + + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + add $t2,$t2,$carry + srdi $carry,$t2,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + add $t4,$t4,$carry + srdi $carry,$t4,16 + add 
$t5,$t5,$carry + srdi $carry,$t5,16 + add $t6,$t6,$carry + srdi $carry,$t6,16 + add $t7,$t7,$carry + + insrdi $t0,$t1,16,32 + insrdi $t0,$t2,16,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + insrdi $t4,$t5,16,32 + insrdi $t4,$t6,16,16 + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + + std $t0,8($tp) ; tp[j-1] + stdu $t4,16($tp) ; tp[j] + bdnz- L1st + + fctid $dota,$dota + fctid $dotb,$dotb + stfd $dota,`$FRAME+0`($sp) + stfd $dotb,`$FRAME+8`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + insrdi $t0,$t1,48,0 + srdi $ovf,$t1,48 + std $t0,8($tp) ; tp[num-1] + + subf $ap_l,$num,$ap_l ; rewind pointers + subf $ap_h,$num,$ap_h + subf $np_l,$num,$np_l + subf $np_h,$num,$np_h + + li $i,8 ; i=1 +.align 4 +Louter: + ldx $t3,$bp,$i ; bp[i] + ld $t0,`$FRAME+$TRANSFER+8`($sp) ; tp[0] + mulld $t7,$a0,$t3 ; ap[0]*bp[i] + add $t7,$t7,$t0 ; ap[0]*bp[i]+tp[0] + mulld $t7,$t7,$n0 ; tp[0]*n0 + + ; transfer b[i] to FPU as 4x16-bit values + extrdi $t0,$t3,16,48 + extrdi $t1,$t3,16,32 + extrdi $t2,$t3,16,16 + extrdi $t3,$t3,16,0 + std $t0,`$FRAME+0`($sp) + std $t1,`$FRAME+8`($sp) + std $t2,`$FRAME+16`($sp) + std $t3,`$FRAME+24`($sp) + lfd $ba,`$FRAME+0`($sp) + lfd $bb,`$FRAME+8`($sp) + lfd $bc,`$FRAME+16`($sp) + lfd $bd,`$FRAME+24`($sp) + fcfid $ba,$ba + fcfid $bb,$bb + fcfid $bc,$bc + fcfid $bd,$bd + + ; transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values + extrdi $t4,$t7,16,48 + extrdi $t5,$t7,16,32 + extrdi $t6,$t7,16,16 + extrdi $t7,$t7,16,0 + std $t4,`$FRAME+32`($sp) + std $t5,`$FRAME+40`($sp) + std $t6,`$FRAME+48`($sp) + std $t7,`$FRAME+56`($sp) + lfd $na,`$FRAME+32`($sp) + lfd $nb,`$FRAME+40`($sp) + lfd $nc,`$FRAME+48`($sp) + lfd $nd,`$FRAME+56`($sp) + fcfid $na,$na + fcfid $nb,$nb + fcfid $nc,$nc + fcfid $nd,$nd + + addi $tp,$sp,`$FRAME+$TRANSFER` + fsub $dota,$dota,$dota + fsub $dotb,$dotb,$dotb + li $carry,0 + mtctr $j +.align 4 +Linner: + lfdu $A0,8($ap_l) ; load 
a[j] in double format + lfdu $A1,8($ap_h) + lfdu $N0,8($np_l) ; load n[j] in double format + lfdu $N1,8($np_h) + lfdu $A2,8($ap_l) ; load a[j+1] in double format + lfdu $A3,8($ap_h) + lfdu $N2,8($np_l) ; load n[j+1] in double format + lfdu $N3,8($np_h) + + fmadd $T0a,$A0,$ba,$dota + fmadd $T0b,$A0,$bb,$dotb + fmul $T1a,$A1,$ba + fmul $T1b,$A1,$bb + fmul $T2a,$A2,$ba + fmul $T2b,$A2,$bb + fmul $T3a,$A3,$ba + fmul $T3b,$A3,$bb + + fmadd $T1a,$A0,$bc,$T1a + fmadd $T1b,$A0,$bd,$T1b + fmadd $T2a,$A1,$bc,$T2a + fmadd $T2b,$A1,$bd,$T2b + fmadd $T3a,$A2,$bc,$T3a + fmadd $T3b,$A2,$bd,$T3b + fmul $dota,$A3,$bc + fmul $dotb,$A3,$bd + + fmadd $T0a,$N0,$na,$T0a + fmadd $T0b,$N0,$nb,$T0b + fmadd $T1a,$N1,$na,$T1a + fmadd $T1b,$N1,$nb,$T1b + fmadd $T2a,$N2,$na,$T2a + fmadd $T2b,$N2,$nb,$T2b + fmadd $T3a,$N3,$na,$T3a + fmadd $T3b,$N3,$nb,$T3b + + fmadd $T1a,$N0,$nc,$T1a + fmadd $T1b,$N0,$nd,$T1b + fmadd $T2a,$N1,$nc,$T2a + fmadd $T2b,$N1,$nd,$T2b + fmadd $T3a,$N2,$nc,$T3a + fmadd $T3b,$N2,$nd,$T3b + fmadd $dota,$N3,$nc,$dota + fmadd $dotb,$N3,$nd,$dotb + + fctid $T0a,$T0a + fctid $T0b,$T0b + fctid $T1a,$T1a + fctid $T1b,$T1b + fctid $T2a,$T2a + fctid $T2b,$T2b + fctid $T3a,$T3a + fctid $T3b,$T3b + + stfd $T0a,`$FRAME+0`($sp) + stfd $T0b,`$FRAME+8`($sp) + stfd $T1a,`$FRAME+16`($sp) + stfd $T1b,`$FRAME+24`($sp) + stfd $T2a,`$FRAME+32`($sp) + stfd $T2b,`$FRAME+40`($sp) + stfd $T3a,`$FRAME+48`($sp) + stfd $T3b,`$FRAME+56`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + ld $t2,`$FRAME+16`($sp) + ld $t3,`$FRAME+24`($sp) + ld $t4,`$FRAME+32`($sp) + ld $t5,`$FRAME+40`($sp) + ld $t6,`$FRAME+48`($sp) + ld $t7,`$FRAME+56`($sp) + + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + srdi $carry,$t1,16 + add $t2,$t2,$carry + srdi $carry,$t2,16 + add $t3,$t3,$carry + srdi $carry,$t3,16 + add $t4,$t4,$carry + srdi $carry,$t4,16 + add $t5,$t5,$carry + srdi $carry,$t5,16 + add $t6,$t6,$carry + srdi $carry,$t6,16 + add $t7,$t7,$carry + + insrdi $t0,$t1,16,32 + 
insrdi $t0,$t2,16,16 + insrdi $t0,$t3,16,0 ; 0..63 bits + insrdi $t4,$t5,16,32 + insrdi $t4,$t6,16,16 + insrdi $t4,$t7,16,0 ; 64..127 bits + srdi $carry,$t7,16 ; upper 33 bits + + ld $t1,8($tp) ; tp[j] + ldu $t2,16($tp) ; tp[j+1] + + addc $t3,$t0,$t1 + adde $t5,$t4,$t2 + addze $carry,$carry + + std $t3,-16($tp) ; tp[j-1] + std $t5,-8($tp) ; tp[j] + bdnz- Linner + + fctid $dota,$dota + fctid $dotb,$dotb + stfd $dota,`$FRAME+0`($sp) + stfd $dotb,`$FRAME+8`($sp) + ld $t0,`$FRAME+0`($sp) + ld $t1,`$FRAME+8`($sp) + add $carry,$carry,$ovf ; comsume upmost overflow + add $t0,$t0,$carry ; can not overflow + srdi $carry,$t0,16 + add $t1,$t1,$carry + insrdi $t0,$t1,48,0 + srdi $ovf,$t1,48 + std $t0,0($tp) ; tp[num-1] + + subf $ap_l,$num,$ap_l ; rewind pointers + subf $ap_h,$num,$ap_h + subf $np_l,$num,$np_l + subf $np_h,$num,$np_h + addi $i,$i,8 + cmpw $i,$num + blt- Louter + + subf $np,$num,$np ; rewind np + subfc $i,$i,$i ; j=0 and "clear" XER[CA] + addi $tp,$sp,`$FRAME+$TRANSFER+8` + addi $t4,$sp,`$FRAME+$TRANSFER+16` + addi $t5,$np,8 + addi $t6,$rp,8 + mtctr $j + +.align 4 +Lsub: ldx $t0,$tp,$i + ldx $t1,$np,$i + ldx $t2,$t4,$i + ldx $t3,$t5,$i + subfe $t0,$t1,$t0 ; tp[j]-np[j] + subfe $t2,$t3,$t2 ; tp[j+1]-np[j+1] + stdx $t0,$rp,$i + stdx $t2,$t6,$i + addi $i,$i,16 + bdnz- Lsub + + li $i,0 + subfe $ovf,$i,$ovf ; handle upmost overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + addi $t7,$ap,8 + mtctr $j + +.align 4 +Lcopy: ; copy or in-place refresh + ldx $t0,$ap,$i + ldx $t1,$t7,$i + stdu $i,8($ap_l) ; zap {an}p_{lh} + stdu $i,8($ap_h) + stdu $i,8($np_l) + stdu $i,8($np_h) + stdu $i,8($ap_l) + stdu $i,8($ap_h) + stdu $i,8($np_l) + stdu $i,8($np_h) + stdx $t0,$rp,$i + stdx $t1,$t6,$i + stdx $i,$tp,$i ; zap tp at once + stdx $i,$t4,$i + addi $i,$i,16 + bdnz- Lcopy + + $POP r14,`2*$SIZE_T`($sp) + $POP r15,`3*$SIZE_T`($sp) + $POP r16,`4*$SIZE_T`($sp) + $POP r17,`5*$SIZE_T`($sp) + $POP r18,`6*$SIZE_T`($sp) + $POP r19,`7*$SIZE_T`($sp) + 
$POP r20,`8*$SIZE_T`($sp) + $POP r21,`9*$SIZE_T`($sp) + $POP r22,`10*$SIZE_T`($sp) + $POP r23,`11*$SIZE_T`($sp) + $POP r24,`12*$SIZE_T`($sp) + $POP r25,`13*$SIZE_T`($sp) + $POP r26,`14*$SIZE_T`($sp) + $POP r27,`15*$SIZE_T`($sp) + lfd f14,`16*$SIZE_T+0`($sp) + lfd f15,`16*$SIZE_T+8`($sp) + lfd f16,`16*$SIZE_T+16`($sp) + lfd f17,`16*$SIZE_T+24`($sp) + lfd f18,`16*$SIZE_T+32`($sp) + lfd f19,`16*$SIZE_T+40`($sp) + lfd f20,`16*$SIZE_T+48`($sp) + lfd f21,`16*$SIZE_T+56`($sp) + lfd f22,`16*$SIZE_T+64`($sp) + lfd f23,`16*$SIZE_T+72`($sp) + lfd f24,`16*$SIZE_T+80`($sp) + lfd f25,`16*$SIZE_T+88`($sp) + $POP $sp,0($sp) + li r3,1 ; signal "handled" + blr + .long 0 +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT;