1 .ident "sparcv8plus.s, Version 1.3"
2 .ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
12 * ====================================================================
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in UltraSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
21 * Questions-n-answers.
24 * A. With SC4.x/SC5.x:
26 * cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
30 * gcc -mcpu=ultrasparc -c bn_asm.sparc.v8plus.S -o bn_asm.o
32 * or if above fails (it does if you have gas installed):
34 * gcc -E bn_asm.sparc.v8plus.S | as -xarch=v8plus /dev/fd/0 -o bn_asm.o
36 * Quick-n-dirty way to fuse the module into the library.
37 * Provided that the library is already configured and built
38 * (in 0.9.2 case with no_asm option):
41 * # cp /some/place/bn_asm.sparc.v8plus.S .
42 * # cc -xarch=v8plus -c bn_asm.sparc.v8plus.S -o bn_asm.o
47 * Quick-n-dirty way to get rid of it:
55 * Q. V8plus architecture? What kind of beast is that?
56 * A. Well, it's rather a programming model than an architecture...
57 * It's actually v9-compliant, i.e. *any* UltraSPARC CPU, under
58 * special conditions, namely when kernel doesn't preserve upper
59 * 32 bits of otherwise 64-bit registers during a context switch.
61 * Q. Why just UltraSPARC? What about SuperSPARC?
62 * A. Original release did target UltraSPARC only. Now SuperSPARC
63 * version is provided along. Both versions share bn_*comba[48]
64 * implementations (see comment later in code for explanation).
65 * But what's so special about this UltraSPARC implementation?
66 * Why didn't I let compiler do the job? Trouble is that most of
67 * available compilers (well, SC5.0 is the only exception) don't
68 * attempt to take advantage of UltraSPARC's 64-bitness under
69 * 32-bit kernels even though it's perfectly possible (see next
72 * Q. 64-bit registers under 32-bit kernels? Didn't you just say it
74 * A. You can't address *all* registers as 64-bit wide:-( The catch is
75 * that you actually may rely upon %o0-%o5 and %g1-%g4 being fully
76 * preserved if you're in a leaf function, i.e. such never calling
77 * any other functions. All functions in this module are leaf and
78 * 10 registers is a handful. And as a matter of fact non-"comba"
79 * routines don't require even that much and I could even afford to
80 * not allocate own stack frame for 'em:-)
82 * Q. What about 64-bit kernels?
83 * A. What about 'em? Just kidding:-) Pure 64-bit version is currently
84 * under evaluation and development...
86 * Q. What about shared libraries?
87 * A. What about 'em? Kidding again:-) Code does *not* contain any
88 * code position dependencies and it's safe to include it into
89 * shared library as is.
91 * Q. How much faster does it go?
92 * A. Do you have a good benchmark? In either case below is what I
93 * experience with crypto/bn/expspeed.c test program:
95 * v8plus module on U10/300MHz against bn_asm.c compiled with:
97 * cc-5.0 -xarch=v8plus -xO5 -xdepend +7-12%
98 * cc-4.2 -xarch=v8plus -xO5 -xdepend +25-35%
99 * egcs-1.1.2 -mcpu=ultrasparc -O3 +35-45%
101 * v8 module on SS10/60MHz against bn_asm.c compiled with:
103 * cc-5.0 -xarch=v8 -xO5 -xdepend +7-10%
104 * cc-4.2 -xarch=v8 -xO5 -xdepend +10%
105 * egcs-1.1.2 -mv8 -O3 +35-45%
107 * As you can see it's damn hard to beat the new Sun C compiler
108 * and it's first of all GNU C users who will appreciate this
109 * assembler implementation:-)
115 * 1.0 - initial release;
116 * 1.1 - new loop unrolling model(*);
117 * - some more fine tuning;
118 * 1.2 - made gas friendly;
119 * - updates to documentation concerning v9;
120 * - new performance comparison matrix;
121 * 1.3 - fixed problem with /usr/ccs/lib/cpp;
123 * (*) Originally unrolled loop looked like this:
125 * op(p+0); if (--n==0) break;
126 * op(p+1); if (--n==0) break;
127 * op(p+2); if (--n==0) break;
128 * op(p+3); if (--n==0) break;
131 * I unroll according to following:
133 * op(p+0); op(p+1); op(p+2); op(p+3);
137 * op(p+0); if (--n==0) return;
138 * op(p+2); if (--n==0) return;
143 .section ".text",#alloc,#execinstr
144 .file "bn_asm.sparc.v8plus.S"

! ----------------------------------------------------------------------
! BN_ULONG bn_mul_add_words(rp, ap, num, w)
! SPARC ABI: %o0=rp, %o1=ap, %o2=num, %o3=w.
! Presumably computes rp[i] += ap[i]*w over num words and returns the
! final carry word -- this module is a drop-in replacement for
! crypto/bn/bn_asm.c (see file header); confirm the exact contract there.
! NOTE(review): only a sampled skeleton (labels and branches) is visible
! in this view; the arithmetic and delay-slot instructions between the
! numbered lines are not shown.
! ----------------------------------------------------------------------
148 .global bn_mul_add_words
150 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
! v9 branch-on-register: proceed only if num (%o2) > 0; ",a" annuls the
! delay slot on the not-taken path.
156 brgz,a %o2,.L_bn_mul_add_words_proceed
161 .L_bn_mul_add_words_proceed:
! If the (not shown) unrolled-iteration count is zero, go straight to
! the 1..3-word tail.
164 bz,pn %icc,.L_bn_mul_add_words_tail
167 .L_bn_mul_add_words_loop: ! wow! 32 aligned!
! Loop back while the unrolled counter (in %icc) is non-zero...
203 bnz,a,pt %icc,.L_bn_mul_add_words_loop
! ...then fall into the tail if leftover words remain (%o2 != 0).
206 brnz,a,pn %o2,.L_bn_mul_add_words_tail
208 .L_bn_mul_add_words_return:
212 .L_bn_mul_add_words_tail:
! Each tail step re-checks %o2 and exits as soon as it hits zero.
219 brz,pt %o2,.L_bn_mul_add_words_return
229 brz,pt %o2,.L_bn_mul_add_words_return
241 .type bn_mul_add_words,#function
242 .size bn_mul_add_words,(.-bn_mul_add_words)
! ----------------------------------------------------------------------
! BN_ULONG bn_mul_words(rp, ap, num, w)
! SPARC ABI: %o0=rp, %o1=ap, %o2=num, %o3=w.
! Presumably computes rp[i] = ap[i]*w and returns the final carry word
! (standard bn_asm.c contract -- confirm against crypto/bn/bn_asm.c).
! NOTE(review): sampled skeleton only; interior instructions are not
! visible in this view. The label ".L_bn_mul_words_proceeed" is
! misspelled ("proceeed"), but the branch below uses the identical
! spelling, so it assembles and behaves correctly.
! ----------------------------------------------------------------------
248 * BN_ULONG bn_mul_words(rp,ap,num,w)
! Enter only when num (%o2) > 0 (annulled v9 branch-on-register).
254 brgz,a %o2,.L_bn_mul_words_proceeed
259 .L_bn_mul_words_proceeed:
! Zero unrolled-iteration count: skip the main loop, handle the tail.
262 bz,pn %icc,.L_bn_mul_words_tail
265 .L_bn_mul_words_loop: ! wow! 32 aligned!
294 bnz,a,pt %icc,.L_bn_mul_words_loop
298 brnz,a,pn %o2,.L_bn_mul_words_tail
300 .L_bn_mul_words_return:
304 .L_bn_mul_words_tail:
! Tail: exit as soon as the remaining word count (%o2) reaches zero.
309 brz,pt %o2,.L_bn_mul_words_return
317 brz,pt %o2,.L_bn_mul_words_return
327 .type bn_mul_words,#function
328 .size bn_mul_words,(.-bn_mul_words)
! ----------------------------------------------------------------------
! void bn_sqr_words(r, a, n)
! SPARC ABI: %o0=r, %o1=a, %o2=n.
! Presumably stores the double-width square of each input word:
! r[2i],r[2i+1] = a[i]^2 (standard bn_asm.c contract -- confirm against
! crypto/bn/bn_asm.c). No return value.
! NOTE(review): sampled skeleton only; interior instructions are not
! visible. Same benign "proceeed" label misspelling as bn_mul_words.
! ----------------------------------------------------------------------
333 * void bn_sqr_words(r,a,n)
! Enter only when n (%o2) > 0 (annulled v9 branch-on-register).
338 brgz,a %o2,.L_bn_sqr_words_proceeed
343 .L_bn_sqr_words_proceeed:
! Zero unrolled-iteration count: skip the main loop, handle the tail.
346 bz,pn %icc,.L_bn_sqr_words_tail
349 .L_bn_sqr_words_loop: ! wow! 32 aligned!
378 bnz,a,pt %icc,.L_bn_sqr_words_loop
382 brnz,a,pn %o2,.L_bn_sqr_words_tail
384 .L_bn_sqr_words_return:
388 .L_bn_sqr_words_tail:
! Tail: exit as soon as the remaining word count (%o2) reaches zero.
393 brz,pt %o2,.L_bn_sqr_words_return
401 brz,pt %o2,.L_bn_sqr_words_return
412 .type bn_sqr_words,#function
413 .size bn_sqr_words,(.-bn_sqr_words)
! ----------------------------------------------------------------------
! BN_ULONG bn_div_words(h, l, d)
! Presumably divides the double word h:l by d and returns the quotient
! word (standard bn_asm.c contract -- confirm against crypto/bn/bn_asm.c).
! NOTE(review): the function body is not visible in this sampled view;
! only the prototype comment and the ELF type/size annotations remain.
! ----------------------------------------------------------------------
418 * BN_ULONG bn_div_words(h,l,d)
428 .type bn_div_words,#function
429 .size bn_div_words,(.-bn_div_words)
! ----------------------------------------------------------------------
! BN_ULONG bn_add_words(rp, ap, bp, n)
! SPARC ABI: %o0=rp, %o1=ap, %o2=bp, %o3=n.
! Presumably computes rp[i] = ap[i] + bp[i] with carry propagation and
! returns the final carry (standard bn_asm.c contract -- confirm there).
! NOTE(review): sampled skeleton only; the addxcc chains and delay-slot
! instructions between the numbered lines are not visible.
! ----------------------------------------------------------------------
435 * BN_ULONG bn_add_words(rp,ap,bp,n)
436 * BN_ULONG *rp,*ap,*bp;
! Enter only when n (%o3) > 0 (annulled v9 branch-on-register).
440 brgz,a %o3,.L_bn_add_words_proceed
445 .L_bn_add_words_proceed:
! Zero unrolled-iteration count: go handle the 1..3-word tail.
447 bz,pn %icc,.L_bn_add_words_tail
! Carry must start clear before the first add-with-carry in the loop.
448 addcc %g0,0,%g0 ! clear carry flag
! First pass enters at the "warm" label, skipping loop-top work that is
! only needed on re-entry (not visible in this view).
455 ba .L_bn_add_words_warm_loop
459 .L_bn_add_words_loop: ! wow! 32 aligned!
467 .L_bn_add_words_warm_loop:
! Loop while the unrolled counter %g1 is non-zero...
488 brnz,a,pt %g1,.L_bn_add_words_loop
! ...then fall into the tail if leftover words remain (%o3 != 0).
491 brnz,a,pn %o3,.L_bn_add_words_tail
493 .L_bn_add_words_return:
499 .L_bn_add_words_tail: ! wow! 32 aligned!
! Tail: exit as soon as the remaining word count (%o3) reaches zero.
503 brz,pt %o3,.L_bn_add_words_return
510 brz,pt %o3,.L_bn_add_words_return
523 .type bn_add_words,#function
524 .size bn_add_words,(.-bn_add_words)
! ----------------------------------------------------------------------
! BN_ULONG bn_sub_words(rp, ap, bp, n)
! SPARC ABI: %o0=rp, %o1=ap, %o2=bp, %o3=n.
! Presumably computes rp[i] = ap[i] - bp[i] with borrow propagation and
! returns the final borrow (standard bn_asm.c contract -- confirm there).
! Structure mirrors bn_add_words exactly.
! NOTE(review): sampled skeleton only; the subxcc chains and delay-slot
! instructions between the numbered lines are not visible.
! ----------------------------------------------------------------------
528 * BN_ULONG bn_sub_words(rp,ap,bp,n)
529 * BN_ULONG *rp,*ap,*bp;
! Enter only when n (%o3) > 0 (annulled v9 branch-on-register).
533 brgz,a %o3,.L_bn_sub_words_proceed
538 .L_bn_sub_words_proceed:
! Zero unrolled-iteration count: go handle the 1..3-word tail.
540 bz,pn %icc,.L_bn_sub_words_tail
! Carry/borrow must start clear before the first subtract-with-borrow.
541 addcc %g0,0,%g0 ! clear carry flag
! First pass enters at the "warm" label, as in bn_add_words.
548 ba .L_bn_sub_words_warm_loop
552 .L_bn_sub_words_loop: ! wow! 32 aligned!
560 .L_bn_sub_words_warm_loop:
! Loop while the unrolled counter %g1 is non-zero...
581 brnz,a,pt %g1,.L_bn_sub_words_loop
! ...then fall into the tail if leftover words remain (%o3 != 0).
584 brnz,a,pn %o3,.L_bn_sub_words_tail
586 .L_bn_sub_words_return:
592 .L_bn_sub_words_tail: ! wow! 32 aligned!
! Tail: exit as soon as the remaining word count (%o3) reaches zero.
596 brz,pt %o3,.L_bn_sub_words_return
603 brz,pt %o3,.L_bn_sub_words_return
616 .type bn_sub_words,#function
617 .size bn_sub_words,(.-bn_sub_words)
620 * The following code is pure SPARC V8! Trouble is that it's not feasible
621 * to implement the mumbo-jumbo in less "V9" instructions:-( At least not
622 * under 32-bit kernel. The reason is that you'd have to shuffle registers
623 * all the time as only few (well, 10:-) are fully (i.e. all 64 bits)
624 * preserved by kernel during context switch. But even under 64-bit kernel
625 * you won't gain much because in the lack of "add with extended carry"
626 * instruction you'd have to issue 'clr %rx; movcs %xcc,1,%rx;
627 * add %rd,%rx,%rd' sequence in place of 'addxcc %rx,%ry,%rx;
628 * addx %rz,%g0,%rz' pair in 32-bit case. Well, 'bpcs,a %xcc,.+8; inc %rd'
629 * is another alternative...
! 96 bytes is the minimal SPARC V8 stack frame required by "save"
! (16-word register-window save area plus mandatory ABI slots);
! negative because "save %sp,FRAME_SIZE,%sp" grows the stack downward.
634 #define FRAME_SIZE -96
637 * Here is register usage map for *all* routines below.
! After "save" in the comba routines, the incoming arguments are seen in
! %i0..%i2: %i0=r (result), %i1=a, %i2=b. BN_ULONG words are 4 bytes,
! hence the 4*I scaling in these addressing-mode macros.
645 #define ap(I) [%i1+4*I]
646 #define bp(I) [%i2+4*I]
647 #define rp(I) [%i0+4*I]
! ----------------------------------------------------------------------
! void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
! Fully unrolled Comba (column-wise) 8x8-word multiply: r[0..15]=a*b.
! c_1/c_2/c_3 rotate through the roles of a three-word column
! accumulator (see the mul_add_c(...) annotations); each umul leaves the
! low 32 bits in t_1, and the high 32 bits are presumably fetched from
! %y into t_2 (pure V8 code per the comment above -- the rd %y lines are
! not visible in this sampled view), then folded in via addcc/addxcc
! carry chains. Uses a register window ("save"), unlike the bn_*_words
! leaf routines.
! NOTE(review): many interleaved addcc/addxcc/rd/ld lines between the
! numbered lines are missing from this view.
! ----------------------------------------------------------------------
668 .global bn_mul_comba8
670 * void bn_mul_comba8(r,a,b)
674 save %sp,FRAME_SIZE,%sp
677 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
680 st c_1,rp(0) !r[0]=c1;
682 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
686 addxcc %g0,t_2,c_3 !=
689 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
693 st c_2,rp(1) !r[1]=c2;
696 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
699 addxcc c_1,t_2,c_1 !=
702 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
708 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
711 addxcc c_1,t_2,c_1 !=
713 st c_3,rp(2) !r[2]=c3;
715 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
720 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
726 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
732 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
737 st c_1,rp(3) !r[3]=c1;
739 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
744 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
749 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
755 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
761 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
766 st c_2,rp(4) !r[4]=c2;
768 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
773 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
778 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
783 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
786 addxcc c_1,t_2,c_1 !=
789 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
795 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
798 addxcc c_1,t_2,c_1 !=
800 st c_3,rp(5) !r[5]=c3;
802 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
807 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
812 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
815 addxcc c_2,t_2,c_2 !=
817 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
822 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
828 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
831 addxcc c_2,t_2,c_2 !=
834 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
838 st c_1,rp(6) !r[6]=c1;
841 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
844 addxcc c_3,t_2,c_3 !=
846 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
851 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
856 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
861 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
864 addxcc c_3,t_2,c_3 !=
866 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
872 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
877 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
880 addxcc c_3,t_2,c_3 !=
882 st c_2,rp(7) !r[7]=c2;
884 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
889 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
894 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
897 addxcc c_1,t_2,c_1 !=
899 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
904 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
909 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
914 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
919 st c_3,rp(8) !r[8]=c3;
921 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
926 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
931 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
934 addxcc c_2,t_2,c_2 !=
936 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
941 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
946 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
951 st c_1,rp(9) !r[9]=c1;
953 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
958 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
963 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
968 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
971 addxcc c_3,t_2,c_3 !=
973 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
978 st c_2,rp(10) !r[10]=c2;
980 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
985 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
988 addxcc c_1,t_2,c_1 !=
990 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
995 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
999 st c_3,rp(11) !r[11]=c3;
1002 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
1005 addxcc c_2,t_2,c_2 !=
1007 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
1012 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
1013 addcc c_1,t_1,c_1 !=
1016 st c_1,rp(12) !r[12]=c1;
1019 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
1022 addxcc c_3,t_2,c_3 !=
1024 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
1029 st c_2,rp(13) !r[13]=c2;
1031 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
1036 st c_3,rp(14) !r[14]=c3;
1037 st c_1,rp(15) !r[15]=c1;
1042 .type bn_mul_comba8,#function
1043 .size bn_mul_comba8,(.-bn_mul_comba8)
! ----------------------------------------------------------------------
! void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
! Fully unrolled Comba 4x4-word multiply: r[0..7] = a[0..3]*b[0..3].
! Same scheme as bn_mul_comba8: c_1/c_2/c_3 rotate as the three-word
! column accumulator, t_1 is the umul low word, t_2 presumably the high
! word fetched from %y (rd %y lines not visible in this sampled view).
! NOTE(review): interleaved carry-chain instructions between the
! numbered lines are missing from this view.
! ----------------------------------------------------------------------
1047 .global bn_mul_comba4
1049 * void bn_mul_comba4(r,a,b)
1050 * BN_ULONG *r,*a,*b;
1053 save %sp,FRAME_SIZE,%sp
1056 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
1059 st c_1,rp(0) !r[0]=c1;
1061 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
1068 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
1073 st c_2,rp(1) !r[1]=c2;
1075 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
1081 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
1087 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1092 st c_3,rp(2) !r[2]=c3;
1094 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
1099 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1102 addxcc c_2,t_2,c_2 !=
1105 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1106 addcc c_1,t_1,c_1 !=
1110 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1115 st c_1,rp(3) !r[3]=c1;
1117 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1122 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1123 addcc c_2,t_1,c_2 !=
1127 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1132 st c_2,rp(4) !r[4]=c2;
1134 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1139 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1140 addcc c_3,t_1,c_3 !=
1143 st c_3,rp(5) !r[5]=c3;
1146 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1149 addxcc c_2,t_2,c_2 !=
1150 st c_1,rp(6) !r[6]=c1;
1151 st c_2,rp(7) !r[7]=c2;
1156 .type bn_mul_comba4,#function
1157 .size bn_mul_comba4,(.-bn_mul_comba4)
! ----------------------------------------------------------------------
! void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
! Fully unrolled Comba 8-word squaring: r[0..15] = a[0..7]^2.
! Exploits symmetry: off-diagonal products a[i]*a[j] (i!=j) are added
! twice (sqr_add_c2 annotations), diagonal squares once (sqr_add_c).
! c_1/c_2/c_3 rotate as the three-word column accumulator; t_1 is the
! umul low word, t_2 presumably the high word from %y (rd %y lines not
! visible in this sampled view).
! NOTE(review): interleaved carry-chain instructions between the
! numbered lines are missing from this view.
! ----------------------------------------------------------------------
1161 .global bn_sqr_comba8
1163 save %sp,FRAME_SIZE,%sp
1166 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1168 st c_1,rp(0) !r[0]=c1;
1171 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1178 st c_2,rp(1) !r[1]=c2;
1181 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1184 addxcc c_1,t_2,c_1 !=
1190 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1195 st c_3,rp(2) !r[2]=c3;
1197 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1206 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1209 addxcc c_2,t_2,c_2 !=
1214 st c_1,rp(3) !r[3]=c1;
1216 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1222 addxcc c_3,t_2,c_3 !=
1224 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1230 addxcc c_3,t_2,c_3 !=
1233 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1234 addcc c_2,t_1,c_2 !=
1237 st c_2,rp(4) !r[4]=c2;
1240 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1243 addxcc c_1,t_2,c_1 !=
1248 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1251 addxcc c_1,t_2,c_1 !=
1257 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1263 addxcc c_1,t_2,c_1 !=
1265 st c_3,rp(5) !r[5]=c3;
1267 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1268 addcc c_1,t_1,c_1 !=
1272 addcc c_1,t_1,c_1 !=
1275 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1276 addcc c_1,t_1,c_1 !=
1280 addcc c_1,t_1,c_1 !=
1283 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1284 addcc c_1,t_1,c_1 !=
1288 addcc c_1,t_1,c_1 !=
1292 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1297 st c_1,rp(6) !r[6]=c1;
1299 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1305 addxcc c_3,t_2,c_3 !=
1307 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1313 addxcc c_3,t_2,c_3 !=
1315 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1321 addxcc c_3,t_2,c_3 !=
1323 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1329 addxcc c_3,t_2,c_3 !=
1331 st c_2,rp(7) !r[7]=c2;
1333 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1334 addcc c_3,t_1,c_3 !=
1338 addcc c_3,t_1,c_3 !=
1341 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1342 addcc c_3,t_1,c_3 !=
1346 addcc c_3,t_1,c_3 !=
1349 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1350 addcc c_3,t_1,c_3 !=
1354 addcc c_3,t_1,c_3 !=
1357 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1358 addcc c_3,t_1,c_3 !=
1361 st c_3,rp(8) !r[8]=c3;
1364 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1367 addxcc c_2,t_2,c_2 !=
1372 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1375 addxcc c_2,t_2,c_2 !=
1380 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1383 addxcc c_2,t_2,c_2 !=
1388 st c_1,rp(9) !r[9]=c1;
1390 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1396 addxcc c_3,t_2,c_3 !=
1398 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1404 addxcc c_3,t_2,c_3 !=
1406 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1411 st c_2,rp(10) !r[10]=c2;
1413 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1421 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1428 st c_3,rp(11) !r[11]=c3;
1431 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1434 addxcc c_2,t_2,c_2 !=
1439 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1442 addxcc c_2,t_2,c_2 !=
1444 st c_1,rp(12) !r[12]=c1;
1446 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1447 addcc c_2,t_1,c_2 !=
1451 addcc c_2,t_1,c_2 !=
1454 st c_2,rp(13) !r[13]=c2;
1457 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1460 addxcc c_1,t_2,c_1 !=
1461 st c_3,rp(14) !r[14]=c3;
1462 st c_1,rp(15) !r[15]=c1;
1467 .type bn_sqr_comba8,#function
1468 .size bn_sqr_comba8,(.-bn_sqr_comba8)
! ----------------------------------------------------------------------
! void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
! Fully unrolled Comba 4-word squaring: r[0..7] = a[0..3]^2.
! Same symmetry trick as bn_sqr_comba8: off-diagonal products added
! twice (sqr_add_c2), diagonal squares once (sqr_add_c); c_1/c_2/c_3
! rotate as the three-word column accumulator, t_2 presumably read from
! %y (rd %y lines not visible in this sampled view).
! NOTE(review): interleaved carry-chain instructions between the
! numbered lines are missing from this view.
! ----------------------------------------------------------------------
1472 .global bn_sqr_comba4
1474 * void bn_sqr_comba4(r,a)
1478 save %sp,FRAME_SIZE,%sp
1480 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1483 st c_1,rp(0) !r[0]=c1;
1486 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1495 st c_2,rp(1) !r[1]=c2;
1497 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1503 addxcc c_1,t_2,c_1 !=
1506 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1507 addcc c_3,t_1,c_3 !=
1510 st c_3,rp(2) !r[2]=c3;
1513 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1516 addxcc c_2,t_2,c_2 !=
1521 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1524 addxcc c_2,t_2,c_2 !=
1529 st c_1,rp(3) !r[3]=c1;
1531 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1537 addxcc c_3,t_2,c_3 !=
1539 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1544 st c_2,rp(4) !r[4]=c2;
1546 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1553 st c_3,rp(5) !r[5]=c3;
1556 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1559 addxcc c_2,t_2,c_2 !=
1560 st c_1,rp(6) !r[6]=c1;
1561 st c_2,rp(7) !r[7]=c2;
1566 .type bn_sqr_comba4,#function
1567 .size bn_sqr_comba4,(.-bn_sqr_comba4)