2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # "Teaser" Montgomery multiplication module for IA-64. There are
20 # several possibilities for improvement:
22 # - modulo-scheduling outer loop would eliminate quite a number of
23 # stalls after ldf8, xma and getf.sig outside inner loop and
24 # improve shorter key performance;
25 # - shorter vector support [with input vectors being fetched only
26 # once] should be added;
27 # - 2x unroll with help of n0[1] would make the code scalable on
28 # "wider" IA-64, "wider" than Itanium 2 that is, which is not of
29 # acute interest, because upcoming Tukwila's individual cores are
30 # reportedly based on Itanium 2 design;
31 # - dedicated squaring procedure(?);
35 # Shorter vector support is implemented by zero-padding ap and np
36 # vectors up to 8 elements, or 512 bits. This means that 256-bit
37 # inputs will be processed only 2 times faster than 512-bit inputs,
38 # not 4 [as one would expect, because algorithm complexity is n^2].
39 # The reason for padding is that inputs shorter than 512 bits won't
40 # be processed faster anyway, because minimal critical path of the
41 # core loop happens to match 512-bit timing. Either way, it resulted
42 # in >100% improvement of 512-bit RSA sign benchmark and 50% - of
43 # 1024-bit one [in comparison to original version of *this* module].
45 # So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
47 # sign verify sign/s verify/s
48 # rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
49 # rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
50 # rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
51 # rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
52 # dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
53 # dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
54 # dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
56 # ... and *without* (but still with ia64.S):
58 # rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
59 # rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
60 # rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
61 # rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
62 # dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
63 # dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
64 # dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
66 # As can be seen, RSA sign performance improves by 130-30%
67 # (the gain shrinks as keys get longer), while verify improves by
68 # 74-13%. DSA performance improves by 115-30%.
74 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
75 } else { $ADDP="add"; }
// bn_mul_mont: entry point, dispatches on the operand length in r37.
// NOTE(review): r37 is presumably the sixth stacked input, i.e. num from
// the prototype below - confirm against the calling convention.
//   2 <= r37 <= 8  -> bn_mul_mont_8
//   r37 > 8        -> bn_mul_mont_general
//   r37 < 2        -> return to the caller straight away
81 // int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
82 // const BN_ULONG *bp,const BN_ULONG *np,
83 // const BN_ULONG *n0p,int num);
// p6 = (2 <= r37), p7 = the opposite
90 { .mmi; cmp4.le p6,p7=2,r37;;
// evaluated under p6: p8 = (8 < r37), p9 = the opposite. The .unc form
// clears both p8 and p9 when p6 is false.
91 (p6) cmp4.lt.unc p8,p9=8,r37
94 (p9) br.cond.dptk.many bn_mul_mont_8
95 (p8) br.cond.dpnt.many bn_mul_mont_general
96 (p7) br.ret.spnt.many b0 };;
// Symbolic names for general registers used by the procedures below.
// prevfs, prevpr and prevsp receive the caller's ar.pfs, predicates and
// stack pointer (see the alloc, "mov pr=prevpr" and .vframe uses further
// down). NOTE(review): prevlc presumably keeps ar.lc, its use is not
// visible in this excerpt - confirm.
99 prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
// working pointers: result, a, b and modulus vectors
101 rptr=r8; aptr=r9; bptr=r14; nptr=r15;
// num is the running outer-loop counter, len is the amount by which aptr,
// nptr and rptr are rewound after each pass
104 num=r18; len=r19; lc=r20;
105 topbit=r21; // carry bit from tmp[num]
// bn_mul_mont_general: file-local procedure that bn_mul_mont branches to
// when its length test takes the "greater than 8" path. This part is the
// prologue plus the straight-line start of the first (bp[0]) pass.
112 .local bn_mul_mont_general#
113 .proc bn_mul_mont_general#
116 { .mmi; .save ar.pfs,prevfs
// new register frame: 6 inputs, 2 locals, 0 outputs, 8 rotating registers
117 alloc prevfs=ar.pfs,6,2,0,8
121 { .mmi; .vframe prevsp
// names for the rotating floating-point registers used by the loops
128 .rotf alo[6],nlo[4],ahi[8],nhi[6]
// fetch bp[0] and the first four words of ap. Going by the existing
// comments, aptr fetches the even words and r30 the odd ones, each
// pointer stepping by 16 bytes.
131 { .mmi; ldf8 bi=[bptr],8 // (*bp++)
132 ldf8 alo[4]=[aptr],16 // ap[0]
134 { .mmi; ldf8 alo[3]=[r30],16 // ap[1]
135 ldf8 alo[2]=[aptr],16 // ap[2]
137 { .mmi; ldf8 alo[1]=[r30] // ap[3]
// $ADDP is replaced by the generator script with the add mnemonic chosen
// for the target. in3 is the fourth argument (np in the prototype).
140 { .mmi; $ADDP nptr=0,in3
143 { .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
// r31 = num*8 + r31, then rounded down to a 16-byte boundary to become
// the new sp. NOTE(review): the initial value of r31 is set outside this
// excerpt - confirm the direction and size of the scratch area.
145 shladd r31=num,3,r31 };;
146 { .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
149 { .mfb; and sp=-16,r31 // alloca
// xmpy.hu / xmpy.lu give the high / low 64 bits of the unsigned product
150 xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
153 xmpy.lu alo[4]=alo[4],bi
154 brp.loop.imp .L1st_ctop,.L1st_cend-16
// xma.hu / xma.lu give the high / low 64 bits of a*b+c. Here c is the
// high half of the previous word's product.
157 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
160 xma.lu alo[3]=alo[3],bi,ahi[2]
// initial stage predicates for the modulo-scheduled loop that follows
161 mov pr.rot=0x20001f<<16
162 // ------^----- (p40) at first (p23)
163 // ----------^^ p[16:20]=1
// m0 = low 64 bits of (low word of ap[0]*bp[0]) * n0
166 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
// seed nhi[1] from f0, which always reads as zero
169 fcvt.fxu.s1 nhi[1]=f0
// First-pass loop (branch target .L1st_ctop) and its tail.
// Stage predicates, as used by the instructions below:
//   p16      load the next ap word
//   p18      multiply an ap word by bi, load the next np word
//   p20      multiply an np word by m0
//   p21      move the low halves into integer registers a[0] / n[0]
//   p40/p42  add n[2]+a[2] with carry-in 0 / 1 and derive the next carry
//            with an unsigned compare (ltu without carry-in, leu with)
//   p23      store the finished word through tp_1
174 .pred.rel "mutex",p40,p42
175 { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
176 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
177 (p40) add n[2]=n[2],a[2] } // (p23) }
178 { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
179 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
180 (p42) add n[2]=n[2],a[2],1 };; // (p23)
181 { .mfi; (p21) getf.sig a[0]=alo[5]
182 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
183 (p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
184 { .mfi; (p23) st8 [tp_1]=n[2],8
185 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
186 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
187 { .mmb; (p21) getf.sig n[0]=nlo[3]
189 br.ctop.sptk .L1st_ctop };;
// Loop tail: pick up the last high half, add it with the pending carry,
// and turn the final carry into topbit (0 or 1).
192 { .mmi; getf.sig a[0]=ahi[6] // (p24)
194 add num=-1,num };; // num--
195 { .mmi; .pred.rel "mutex",p40,p42
196 (p40) add n[0]=n[0],a[0]
197 (p42) add n[0]=n[0],a[0],1
// move aptr back by len so the next pass starts at ap[0] again
198 sub aptr=aptr,len };; // rewind
199 { .mmi; .pred.rel "mutex",p40,p42
200 (p40) cmp.ltu p41,p39=n[0],a[0]
201 (p42) cmp.leu p41,p39=n[0],a[0]
// same rewind for nptr
202 sub nptr=nptr,len };;
203 { .mmi; .pred.rel "mutex",p39,p41
// p39 = no carry out, p41 = carry out
204 (p39) add topbit=r0,r0
205 (p41) add topbit=r0,r0,1
// store the top word of this pass
207 { .mmi; st8 [tp_1]=n[0]
// Set-up for one outer iteration (one word of bp). It mirrors the
// first-pass set-up, except that the running tp[0] is added into the
// first product.
// NOTE(review): the .Louter label that the later branch targets is
// presumably just above this point, it is not visible in the excerpt.
212 { .mmi; ldf8 bi=[bptr],8 // (*bp++)
213 ldf8 ahi[3]=[tptr] // tp[0]
215 { .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
216 ldf8 alo[3]=[r30],16 // ap[1]
218 { .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
// high / low halves of ap[0]*bi + tp[0]
219 xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
220 brp.loop.imp .Linner_ctop,.Linner_cend-16
222 { .mfb; ldf8 alo[1]=[r30] // ap[3]
223 xma.lu alo[4]=alo[4],bi,ahi[3]
// here nptr steps by 16 and r31 supplies np[1], unlike the first pass
225 { .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
226 xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
228 { .mfi; ldf8 nlo[1]=[r31] // np[1]
229 xma.lu alo[3]=alo[3],bi,ahi[2]
// initial stage predicates. Compared with the first pass one more bit is
// set, for the p30/p32 add-tp chain of the inner loop.
230 mov pr.rot=0x20101f<<16
231 // ------^----- (p40) at first (p23)
232 // --------^--- (p30) at first (p22)
233 // ----------^^ p[16:20]=1
// tp[0] was consumed by the xma above, so the slot is overwritten with 0
235 { .mfi; st8 [tptr]=r0 // tp[0] is already accounted
// m0 = low 64 bits of (low word of ap[0]*bi + tp[0]) * n0
236 xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
// seed nhi[1] from f0, which always reads as zero
239 fcvt.fxu.s1 nhi[1]=f0
242 // This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
243 // 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
244 // in latter case accounts for two-tick pipeline stall, which means
245 // that its performance would be ~20% lower than optimal one. No
246 // attempt was made to address this, because original Itanium is
247 // hardly represented out in the wild...
// Inner loop (branch target .Linner_ctop). Same pipeline as the first
// pass, plus a second carry chain that adds the previous tp word:
//   p16      load the next ap word
//   p18      multiply an ap word by bi, load the next np word
//   p20      multiply an np word by m0
//   p21      move low halves to a[0] / n[0] and load t[0] from tptr
//   p30/p32  a[1] += t[1] with carry-in 0 / 1, next carry to p31 / p29
//   p40/p42  n[2] += a[2] with carry-in 0 / 1, next carry to p41 / p39
//   p23      store the finished word through tp_1
250 .pred.rel "mutex",p40,p42
251 .pred.rel "mutex",p30,p32
252 { .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
253 (p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
254 (p40) add n[2]=n[2],a[2] } // (p23)
255 { .mfi; (p16) nop.m 0
256 (p18) xma.lu alo[2]=alo[2],bi,ahi[1]
257 (p42) add n[2]=n[2],a[2],1 };; // (p23)
258 { .mfi; (p21) getf.sig a[0]=alo[5]
260 (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
261 { .mfi; (p21) ld8 t[0]=[tptr],8
263 (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
264 { .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
265 (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
266 (p30) add a[1]=a[1],t[1] } // (p22)
267 { .mfi; (p16) nop.m 0
268 (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
269 (p32) add a[1]=a[1],t[1],1 };; // (p22)
270 { .mmi; (p21) getf.sig n[0]=nlo[3]
272 (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
273 { .mmb; (p23) st8 [tp_1]=n[2],8
274 (p32) cmp.leu p31,p29=a[1],t[1] // (p22)
275 br.ctop.sptk .Linner_ctop };;
// Tail of one outer iteration: fold the previous topbit and the pending
// carries into the top word, store it, rewind the pointers and loop.
278 { .mmi; getf.sig a[0]=ahi[6] // (p24)
// a[0] += topbit, plus 1 when the add-tp chain left a carry (p33)
282 { .mmi; .pred.rel "mutex",p31,p33
283 (p31) add a[0]=a[0],topbit
284 (p33) add a[0]=a[0],topbit,1
// p32 = that addition carried out
286 { .mfi; .pred.rel "mutex",p31,p33
287 (p31) cmp.ltu p32,p30=a[0],topbit
288 (p33) cmp.leu p32,p30=a[0],topbit
// n[0] += a[0], plus 1 when the n+a chain left a carry (p42)
290 { .mfi; .pred.rel "mutex",p40,p42
291 (p40) add n[0]=n[0],a[0]
292 (p42) add n[0]=n[0],a[0],1
// NOTE(review): this annotation names p44,p46 while the two compares in
// the bundle are predicated on p40,p42 - confirm which pair is intended.
294 { .mmi; .pred.rel "mutex",p44,p46
295 (p40) cmp.ltu p41,p39=n[0],a[0]
296 (p42) cmp.leu p41,p39=n[0],a[0]
// topbit becomes 1 when the first addition carried ...
297 (p32) add topbit=r0,r0,1 }
299 { .mmi; st8 [tp_1]=n[0],8
301 sub aptr=aptr,len };; // rewind
302 { .mmi; sub nptr=nptr,len
// ... or when the second one did
303 (p41) add topbit=r0,r0,1
// restart the store pointer one word above sp
305 { .mmb; add tp_1=8,sp
306 add num=-1,num // num--
// repeat while p6 holds. The compare that sets p6 is not visible in this
// excerpt.
307 (p6) br.cond.sptk.many .Louter };;
// Subtraction loop (branch target .Lsub_ctop): rp[i] = tp[i] - np[i]
// minus the running borrow.
//   p16      load t[0] from tptr and n[0] from nptr
//   p33/p35  subtract with borrow-in 0 / 1, difference written to n[1]
//   p18      store the difference through rptr
// The borrow out is found with an unsigned compare of the difference
// against the minuend: gtu without borrow-in, geu with borrow-in.
310 brp.loop.imp .Lsub_ctop,.Lsub_cend-16
313 mov pr.rot=0x10001<<16
314 // ------^---- (p33) at first (p17)
321 .pred.rel "mutex",p33,p35
322 { .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
324 (p33) sub n[1]=t[1],n[1] } // (p17)
325 { .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
327 (p35) sub n[1]=t[1],n[1],1 };; // (p17)
328 { .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
329 (p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
331 { .mib; (p18) nop.m 0
332 (p35) cmp.geu p34,p32=n[1],t[1] // (p17)
333 br.ctop.sptk .Lsub_ctop };;
// Final selection and copy. The last borrow of the subtraction loop is
// folded into topbit. The p36 form subtracts an extra 1.
336 { .mmb; .pred.rel "mutex",p34,p36
337 (p34) sub topbit=topbit,r0 // (p19)
338 (p36) sub topbit=topbit,r0,1
339 brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
341 { .mmb; sub rptr=rptr,len // rewind
// nptr = (tptr AND topbit) OR (rptr AND NOT topbit). topbit serves as a
// select mask between the two buffers, without a branch.
// NOTE(review): this selects cleanly only if topbit is 0 or all-ones at
// this point - confirm.
344 { .mmi; and aptr=tptr,topbit
345 andcm bptr=rptr,topbit
347 { .mii; or nptr=aptr,bptr
// Copy loop (branch target .Lcopy_ctop): read a word from the selected
// buffer, overwrite the tp word with zero, write the word to rp.
352 { .mmb; (p16) ld8 n[0]=[nptr],8
353 (p18) st8 [tptr]=r0,8
355 { .mmb; (p16) nop.m 0
356 (p18) st8 [rptr]=n[2],8
357 br.ctop.sptk .Lcopy_ctop };;
// Epilogue: return value 1, restore the caller's predicates, return.
360 { .mmi; mov ret0=1 // signal "handled"
361 rum 1<<5 // clear um.mfh
365 mov pr=prevpr,0x1ffff
366 br.ret.sptk.many b0 };;
367 .endp bn_mul_mont_general#
// Register names for bn_mul_mont_8.
//   a1..a8 / n1..n8   integer copies of the ap*b and np*m partial words
//   ai0..ai7          f8..f15, the ap words kept in registers
//   ni0..ni7          f16..f23, the np words kept in registers
// r16 and r17 (a1, a2) are also used as scratch pointers by the prologue
// below, before the loop gives them their a1/a2 role.
369 a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
370 n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
373 ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
374 ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
377 .skip 48 // aligns loop body
// bn_mul_mont_8: file-local procedure that bn_mul_mont branches to for
// lengths of 2 to 8 words.
378 .local bn_mul_mont_8#
382 { .mmi; .save ar.pfs,prevfs
// new register frame: 6 inputs, 2 locals, 0 outputs, 8 rotating registers
383 alloc prevfs=ar.pfs,6,2,0,8
388 { .mmi; add r17=-6*16,sp
// Spill the floating-point registers from f16 upward to 16-byte stack
// slots below the entry sp, so they can be refilled before returning.
// Stores alternate between two pointers that advance by 32 bytes. Each
// .save.gf directive records one spilled register for the unwinder.
393 { .mmi; .save.gf 0,0x10
394 stf.spill [sp]=f16,-16
396 stf.spill [r17]=f17,32
397 add r16=-5*16,prevsp};;
398 { .mmi; .save.gf 0,0x40
399 stf.spill [r16]=f18,32
401 stf.spill [r17]=f19,32
403 { .mmi; .save.gf 0,0x100
404 stf.spill [r16]=f20,32
406 stf.spill [r17]=f21,32
408 { .mmi; .save.gf 0,0x400
// names for the rotating floating-point registers used by the loop
415 .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
418 // load input vectors padding them to 8 elements
// Words 0 and 1 of each vector are loaded unconditionally. Words 2 to 7
// are loaded under predicates derived from the length in in5:
//   p4/p5   = (3 <= num) / not     p6/p7   = (4 <= num) / not
//   p8/p9   = (5 <= num) / not     p10/p11 = (6 <= num) / not
//   p12/p13 = (7 <= num) / not     p14/p15 = (8 <= num) / not
// For bp the complementary predicate writes zero into the bj[] slot
// instead (fcvt.fxu from f0). The same predicates are reused at the end
// of the procedure to stop storing result words early.
// Each vector is read through two pointers that step by 16 bytes.
419 { .mmi; ldf8 ai0=[aptr],16 // ap[0]
420 ldf8 ai1=[r29],16 // ap[1]
// r30 = in2 + 8, the second bp pointer, formed with the generator's
// $ADDP mnemonic
422 { .mmi; $ADDP r30=8,in2
425 { .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
426 ldf8 bj[6]=[r30],16 // bp[1]
427 cmp4.le p4,p5=3,in5 }
428 { .mmi; ldf8 ni0=[nptr],16 // np[0]
429 ldf8 ni1=[r31],16 // np[1]
430 cmp4.le p6,p7=4,in5 };;
432 { .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
434 cmp4.le p8,p9=5,in5 }
435 { .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
437 cmp4.le p10,p11=6,in5 }
438 { .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
439 (p5)fcvt.fxu bj[5]=f0
440 cmp4.le p12,p13=7,in5 }
441 { .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
442 (p7)fcvt.fxu bj[4]=f0
443 cmp4.le p14,p15=8,in5 }
444 { .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
447 { .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
// n0 is read through in4, the fifth argument (n0p in the prototype)
451 { .mfi; ldf8 n0=[in4]
455 { .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
458 { .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
461 { .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
462 (p9)fcvt.fxu bj[3]=f0
464 { .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
465 (p11)fcvt.fxu bj[2]=f0
467 { .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
470 { .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
474 { .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
477 { .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
480 { .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
481 (p13)fcvt.fxu bj[1]=f0
483 { .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
484 (p15)fcvt.fxu bj[0]=f0
486 { .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
489 { .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
491 brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
494 // The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
495 // to measure with help of Interval Time Counter indicated that the
496 // factor is a tad higher: 33 or 34, if not 35. Exact measurement and
497 // addressing the issue is problematic, because I don't have access
498 // to platform-specific instruction-level profiler. On Itanium it
499 // should run in 56*n ticks, because of higher xma latency...
// Fully unrolled outer loop (branch target .Louter_8_ctop), one trip per
// bp word. The slot comments 0: to 31: appear to correspond to the 32
// ticks mentioned above.
//   p16  work for the current bp word: the ap[k]*bj[7] chain (ahi/alo),
//        m0 = low word * n0 (mj[0]), the np[k]*m0 chain (nhi/nlo) and the
//        getf.sig moves of finished low halves into a1..a6 / n1..n4
//   p17  work still pending from the previous bp word: np[6] and np[7]
//        products and the moves into a7, a8, n5..n8
//   p40..p43  carry chain for a_k += n_k (ltu without carry-in, leu with)
//   p48..p51  carry chain for t[] += a_k
// Each add/compare pair alternates between two predicate pairs, so the
// carry produced in one slot is consumed in the next.
501 .pred.rel "mutex",p40,p42
502 .pred.rel "mutex",p48,p50
503 { .mfi; (p16) nop.m 0 // 0:
504 (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
505 (p40) add a3=a3,n3 } // (p17) a3+=n3
506 { .mfi; (p42) add a3=a3,n3,1
507 (p16) xma.lu alo[0]=ai0,bj[7],tf[1]
509 { .mii; (p17) getf.sig a7=alo[8] // 1:
510 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
511 (p50) add t[6]=t[6],a3,1 };;
512 { .mfi; (p17) getf.sig a8=ahi[8] // 2:
513 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
514 (p40) cmp.ltu p43,p41=a3,n3 }
515 { .mfi; (p42) cmp.leu p43,p41=a3,n3
516 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
518 { .mii; (p17) getf.sig n5=nlo[6] // 3:
519 (p48) cmp.ltu p51,p49=t[6],a3
520 (p50) cmp.leu p51,p49=t[6],a3 };;
521 .pred.rel "mutex",p41,p43
522 .pred.rel "mutex",p49,p51
523 { .mfi; (p16) nop.m 0 // 4:
524 (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
525 (p41) add a4=a4,n4 } // (p17) a4+=n4
526 { .mfi; (p43) add a4=a4,n4,1
527 (p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
529 { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
530 (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
531 (p51) add t[5]=t[5],a4,1 };;
532 { .mfi; (p16) nop.m 0 // 6:
533 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
534 (p41) cmp.ltu p42,p40=a4,n4 }
535 { .mfi; (p43) cmp.leu p42,p40=a4,n4
536 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
538 { .mii; (p17) getf.sig n6=nlo[7] // 7:
539 (p49) cmp.ltu p50,p48=t[5],a4
540 (p51) cmp.leu p50,p48=t[5],a4 };;
541 .pred.rel "mutex",p40,p42
542 .pred.rel "mutex",p48,p50
543 { .mfi; (p16) nop.m 0 // 8:
544 (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
545 (p40) add a5=a5,n5 } // (p17) a5+=n5
546 { .mfi; (p42) add a5=a5,n5,1
547 (p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
549 { .mii; (p16) getf.sig a1=alo[1] // 9:
550 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
551 (p50) add t[4]=t[4],a5,1 };;
552 { .mfi; (p16) nop.m 0 // 10:
553 (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
554 (p40) cmp.ltu p43,p41=a5,n5 }
555 { .mfi; (p42) cmp.leu p43,p41=a5,n5
556 (p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
558 { .mii; (p17) getf.sig n7=nlo[8] // 11:
559 (p48) cmp.ltu p51,p49=t[4],a5
560 (p50) cmp.leu p51,p49=t[4],a5 };;
561 .pred.rel "mutex",p41,p43
562 .pred.rel "mutex",p49,p51
563 { .mfi; (p17) getf.sig n8=nhi[8] // 12:
564 (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
565 (p41) add a6=a6,n6 } // (p17) a6+=n6
566 { .mfi; (p43) add a6=a6,n6,1
567 (p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
569 { .mii; (p16) getf.sig a2=alo[2] // 13:
570 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
571 (p51) add t[3]=t[3],a6,1 };;
572 { .mfi; (p16) nop.m 0 // 14:
573 (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
574 (p41) cmp.ltu p42,p40=a6,n6 }
575 { .mfi; (p43) cmp.leu p42,p40=a6,n6
576 (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
578 { .mii; (p16) nop.m 0 // 15:
579 (p49) cmp.ltu p50,p48=t[3],a6
580 (p51) cmp.leu p50,p48=t[3],a6 };;
581 .pred.rel "mutex",p40,p42
582 .pred.rel "mutex",p48,p50
583 { .mfi; (p16) nop.m 0 // 16:
584 (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
585 (p40) add a7=a7,n7 } // (p17) a7+=n7
586 { .mfi; (p42) add a7=a7,n7,1
587 (p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
589 { .mii; (p16) getf.sig a3=alo[3] // 17:
590 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
591 (p50) add t[2]=t[2],a7,1 };;
592 { .mfi; (p16) nop.m 0 // 18:
593 (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
594 (p40) cmp.ltu p43,p41=a7,n7 }
595 { .mfi; (p42) cmp.leu p43,p41=a7,n7
596 (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
598 { .mii; (p16) getf.sig n1=nlo[1] // 19:
599 (p48) cmp.ltu p51,p49=t[2],a7
600 (p50) cmp.leu p51,p49=t[2],a7 };;
601 .pred.rel "mutex",p41,p43
602 .pred.rel "mutex",p49,p51
603 { .mfi; (p16) nop.m 0 // 20:
604 (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
605 (p41) add a8=a8,n8 } // (p17) a8+=n8
606 { .mfi; (p43) add a8=a8,n8,1
607 (p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
609 { .mii; (p16) getf.sig a4=alo[4] // 21:
610 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
611 (p51) add t[1]=t[1],a8,1 };;
612 { .mfi; (p16) nop.m 0 // 22:
613 (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
614 (p41) cmp.ltu p42,p40=a8,n8 }
615 { .mfi; (p43) cmp.leu p42,p40=a8,n8
616 (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
618 { .mii; (p16) getf.sig n2=nlo[2] // 23:
619 (p49) cmp.ltu p50,p48=t[1],a8
620 (p51) cmp.leu p50,p48=t[1],a8 };;
// From here on the carry chains start over for the current bp word:
// a1 += n1 and t0 = t[7] + a1 run without carry-in. t[0], the new top
// word, is cleared and then collects the two leftover carries (p42, p50)
// of the previous word.
621 { .mfi; (p16) nop.m 0 // 24:
622 (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
623 (p16) add a1=a1,n1 } // (p16) a1+=n1
624 { .mfi; (p16) nop.m 0
625 (p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
626 (p17) mov t[0]=r0 };;
627 { .mii; (p16) getf.sig a5=alo[5] // 25:
628 (p16) add t0=t[7],a1 // (p16) t[7]+=a1
629 (p42) add t[0]=t[0],r0,1 };;
// t0 is moved to tf[0] in the floating-point file. The slot 0 xma reads
// it as tf[1], presumably on the following trip after register rotation.
630 { .mfi; (p16) setf.sig tf[0]=t0 // 26:
631 (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
632 (p50) add t[0]=t[0],r0,1 }
633 { .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
634 (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
636 { .mii; (p16) getf.sig n3=nlo[3] // 27:
637 (p16) cmp.ltu.unc p50,p48=t0,a1
639 .pred.rel "mutex",p40,p42
640 .pred.rel "mutex",p48,p50
641 { .mfi; (p16) nop.m 0 // 28:
642 (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
643 (p40) add a2=a2,n2 } // (p16) a2+=n2
644 { .mfi; (p42) add a2=a2,n2,1
645 (p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
647 { .mii; (p16) getf.sig a6=alo[6] // 29:
648 (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
649 (p50) add t[6]=t[6],a2,1 };;
650 { .mfi; (p16) nop.m 0 // 30:
651 (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
652 (p40) cmp.ltu p41,p39=a2,n2 }
653 { .mfi; (p42) cmp.leu p41,p39=a2,n2
654 (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
656 { .mfi; (p16) getf.sig n4=nlo[4] // 31:
658 (p48) cmp.ltu p49,p47=t[6],a2 }
659 { .mfb; (p50) cmp.leu p49,p47=t[6],a2
661 br.ctop.sptk.many .Louter_8_ctop };;
664 // above loop has to execute one more time, without (p16), which is
665 // replaced with merged move of np[8] to GPR bank
// Drain pass: only the p17 work and the two carry chains of the last bp
// word remain. The freed memory slots are used by unconditional (p0)
// getf.sig instructions that copy the modulus words ni0..ni7 into
// n1..n8. Each nK is overwritten only after its last use in this pass.
// The final subtraction reads the modulus from those registers.
666 .pred.rel "mutex",p40,p42
667 .pred.rel "mutex",p48,p50
668 { .mmi; (p0) getf.sig n1=ni0 // 0:
669 (p40) add a3=a3,n3 // (p17) a3+=n3
670 (p42) add a3=a3,n3,1 };;
671 { .mii; (p17) getf.sig a7=alo[8] // 1:
672 (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
673 (p50) add t[6]=t[6],a3,1 };;
674 { .mfi; (p17) getf.sig a8=ahi[8] // 2:
675 (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
676 (p40) cmp.ltu p43,p41=a3,n3 }
677 { .mfi; (p42) cmp.leu p43,p41=a3,n3
678 (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
680 { .mii; (p17) getf.sig n5=nlo[6] // 3:
681 (p48) cmp.ltu p51,p49=t[6],a3
682 (p50) cmp.leu p51,p49=t[6],a3 };;
683 .pred.rel "mutex",p41,p43
684 .pred.rel "mutex",p49,p51
685 { .mmi; (p0) getf.sig n2=ni1 // 4:
686 (p41) add a4=a4,n4 // (p17) a4+=n4
687 (p43) add a4=a4,n4,1 };;
688 { .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
690 (p51) add t[5]=t[5],a4,1 };;
691 { .mfi; (p0) getf.sig n3=ni2 // 6:
692 (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
693 (p41) cmp.ltu p42,p40=a4,n4 }
694 { .mfi; (p43) cmp.leu p42,p40=a4,n4
695 (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
697 { .mii; (p17) getf.sig n6=nlo[7] // 7:
698 (p49) cmp.ltu p50,p48=t[5],a4
699 (p51) cmp.leu p50,p48=t[5],a4 };;
700 .pred.rel "mutex",p40,p42
701 .pred.rel "mutex",p48,p50
702 { .mii; (p0) getf.sig n4=ni3 // 8:
703 (p40) add a5=a5,n5 // (p17) a5+=n5
704 (p42) add a5=a5,n5,1 };;
705 { .mii; (p0) nop.m 0 // 9:
706 (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5
707 (p50) add t[4]=t[4],a5,1 };;
708 { .mii; (p0) nop.m 0 // 10:
709 (p40) cmp.ltu p43,p41=a5,n5
710 (p42) cmp.leu p43,p41=a5,n5 };;
711 { .mii; (p17) getf.sig n7=nlo[8] // 11:
712 (p48) cmp.ltu p51,p49=t[4],a5
713 (p50) cmp.leu p51,p49=t[4],a5 };;
714 .pred.rel "mutex",p41,p43
715 .pred.rel "mutex",p49,p51
716 { .mii; (p17) getf.sig n8=nhi[8] // 12:
717 (p41) add a6=a6,n6 // (p17) a6+=n6
718 (p43) add a6=a6,n6,1 };;
719 { .mii; (p0) getf.sig n5=ni4 // 13:
720 (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
721 (p51) add t[3]=t[3],a6,1 };;
722 { .mii; (p0) nop.m 0 // 14:
723 (p41) cmp.ltu p42,p40=a6,n6
724 (p43) cmp.leu p42,p40=a6,n6 };;
725 { .mii; (p0) getf.sig n6=ni5 // 15:
726 (p49) cmp.ltu p50,p48=t[3],a6
727 (p51) cmp.leu p50,p48=t[3],a6 };;
728 .pred.rel "mutex",p40,p42
729 .pred.rel "mutex",p48,p50
730 { .mii; (p0) nop.m 0 // 16:
731 (p40) add a7=a7,n7 // (p17) a7+=n7
732 (p42) add a7=a7,n7,1 };;
733 { .mii; (p0) nop.m 0 // 17:
734 (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
735 (p50) add t[2]=t[2],a7,1 };;
736 { .mii; (p0) nop.m 0 // 18:
737 (p40) cmp.ltu p43,p41=a7,n7
738 (p42) cmp.leu p43,p41=a7,n7 };;
739 { .mii; (p0) getf.sig n7=ni6 // 19:
740 (p48) cmp.ltu p51,p49=t[2],a7
741 (p50) cmp.leu p51,p49=t[2],a7 };;
742 .pred.rel "mutex",p41,p43
743 .pred.rel "mutex",p49,p51
744 { .mii; (p0) nop.m 0 // 20:
745 (p41) add a8=a8,n8 // (p17) a8+=n8
746 (p43) add a8=a8,n8,1 };;
747 { .mmi; (p0) nop.m 0 // 21:
748 (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
749 (p51) add t[1]=t[1],a8,1 }
// t[0], the carry word above t[1], starts from zero and collects the
// last a8+n8 carry (p42) just below
750 { .mmi; (p17) mov t[0]=r0
751 (p41) cmp.ltu p42,p40=a8,n8
752 (p43) cmp.leu p42,p40=a8,n8 };;
753 { .mmi; (p0) getf.sig n8=ni7 // 22:
754 (p49) cmp.ltu p50,p48=t[1],a8
755 (p51) cmp.leu p50,p48=t[1],a8 }
756 { .mmi; (p42) add t[0]=t[0],r0,1
// r16 / r17 are pointed at two 16-byte slots below the entry sp, ready
// for the ldf.fill restores in the epilogue
757 (p0) add r16=-7*16,prevsp
758 (p0) add r17=-6*16,prevsp };;
760 // subtract np[8] from carrybit|tmp[8]
761 // carrybit|tmp[8] layout upon exit from above loop is:
762 // t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
// Word-by-word subtraction with borrow, least significant word first.
// Each difference overwrites the matching nK register, while the t words
// are left untouched so either set can be stored afterwards.
// The borrow alternates between the predicate pairs p32/p34 and p33/p35.
// In each pair the higher-numbered predicate means "borrow". Borrow out
// is detected by comparing the difference with the minuend: gtu without
// borrow-in, geu with borrow-in.
// t[0] first absorbs the last pending carry of the t[1]+a8 addition
763 { .mmi; (p50)add t[0]=t[0],r0,1
// p34 = (n1 > t0). NOTE(review): presumably the borrow out of the lowest
// word, the matching subtract is not visible in this excerpt - confirm.
766 { .mmi; cmp.gtu p34,p32=n1,t0;;
767 .pred.rel "mutex",p32,p34
769 (p34)sub n2=t[7],n2,1 };;
770 { .mii; (p32)cmp.gtu p35,p33=n2,t[7]
771 (p34)cmp.geu p35,p33=n2,t[7];;
772 .pred.rel "mutex",p33,p35
773 (p33)sub n3=t[6],n3 }
774 { .mmi; (p35)sub n3=t[6],n3,1;;
775 (p33)cmp.gtu p34,p32=n3,t[6]
776 (p35)cmp.geu p34,p32=n3,t[6] };;
777 .pred.rel "mutex",p32,p34
778 { .mii; (p32)sub n4=t[5],n4
779 (p34)sub n4=t[5],n4,1;;
780 (p32)cmp.gtu p35,p33=n4,t[5] }
781 { .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
782 .pred.rel "mutex",p33,p35
784 (p35)sub n5=t[4],n5,1 };;
785 { .mii; (p33)cmp.gtu p34,p32=n5,t[4]
786 (p35)cmp.geu p34,p32=n5,t[4];;
787 .pred.rel "mutex",p32,p34
788 (p32)sub n6=t[3],n6 }
789 { .mmi; (p34)sub n6=t[3],n6,1;;
790 (p32)cmp.gtu p35,p33=n6,t[3]
791 (p34)cmp.geu p35,p33=n6,t[3] };;
792 .pred.rel "mutex",p33,p35
793 { .mii; (p33)sub n7=t[2],n7
794 (p35)sub n7=t[2],n7,1;;
795 (p33)cmp.gtu p34,p32=n7,t[2] }
796 { .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
797 .pred.rel "mutex",p32,p34
799 (p34)sub n8=t[1],n8,1 };;
800 { .mii; (p32)cmp.gtu p35,p33=n8,t[1]
801 (p34)cmp.geu p35,p33=n8,t[1];;
802 .pred.rel "mutex",p33,p35
// Last step: take the borrow out of the carry word t[0]. After the
// compare, p34 means the whole subtraction borrowed and p32 means it did
// not.
803 (p33)sub a8=t[0],r0 }
804 { .mmi; (p35)sub a8=t[0],r0,1;;
805 (p33)cmp.gtu p34,p32=a8,t[0]
806 (p35)cmp.geu p34,p32=a8,t[0] };;
808 // save the result, either tmp[num] or tmp[num]-np[num]
// p32 (no final borrow) stores the differences n1..n8. p34 (final borrow)
// stores the unreduced t words instead.
// After the second word, and after each further word, a branch to .Ldone
// leaves early under p5, p7, p9, p11, p13 and p15. The load phase set
// those when num is below 3, 4, 5, 6, 7 and 8 respectively, so exactly
// num words are written.
809 .pred.rel "mutex",p32,p34
810 { .mmi; (p32)st8 [rptr]=n1,8
// r19 is a further restore pointer for the epilogue
812 add r19=-4*16,prevsp};;
813 { .mmb; (p32)st8 [rptr]=n2,8
814 (p34)st8 [rptr]=t[7],8
815 (p5)br.cond.dpnt.few .Ldone };;
816 { .mmb; (p32)st8 [rptr]=n3,8
817 (p34)st8 [rptr]=t[6],8
818 (p7)br.cond.dpnt.few .Ldone };;
819 { .mmb; (p32)st8 [rptr]=n4,8
820 (p34)st8 [rptr]=t[5],8
821 (p9)br.cond.dpnt.few .Ldone };;
822 { .mmb; (p32)st8 [rptr]=n5,8
823 (p34)st8 [rptr]=t[4],8
824 (p11)br.cond.dpnt.few .Ldone };;
825 { .mmb; (p32)st8 [rptr]=n6,8
826 (p34)st8 [rptr]=t[3],8
827 (p13)br.cond.dpnt.few .Ldone };;
828 { .mmb; (p32)st8 [rptr]=n7,8
829 (p34)st8 [rptr]=t[2],8
830 (p15)br.cond.dpnt.few .Ldone };;
831 { .mmb; (p32)st8 [rptr]=n8,8
832 (p34)st8 [rptr]=t[1],8
// Epilogue, presumably at the .Ldone label (the label line is not in this
// excerpt): refill the floating-point registers spilled by the prologue
// through four pointers that step by 64 bytes, restore the caller's
// predicates and return 1.
835 { .mmi; ldf.fill f16=[r16],64
836 ldf.fill f17=[r17],64
838 { .mmi; ldf.fill f18=[r18],64
839 ldf.fill f19=[r19],64
840 mov pr=prevpr,0x1ffff };;
841 { .mmi; ldf.fill f20=[r16]
844 { .mmi; ldf.fill f22=[r18]
846 mov ret0=1 } // signal "handled"
850 br.ret.sptk.many b0 };;
853 .type copyright#,\@object
855 stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
858 open STDOUT,">$output" if $output;