1 .ident "sparcv8.s, Version 1.2"
2 .ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
5 * ====================================================================
6 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
12 * ====================================================================
16 * This is my modest contribution to the OpenSSL project (see
17 * http://www.openssl.org/ for more information about it) and is
18 * a drop-in SuperSPARC ISA replacement for crypto/bn/bn_asm.c
19 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
21 * See bn_asm.sparc.v8plus.S for more details.
27 * 1.1 - new loop unrolling model(*);
28 * 1.2 - made gas friendly;
30 * (*) see bn_asm.sparc.v8plus.S for details
! Everything that follows is emitted into the .text section, flagged as
! allocatable (#alloc) and executable (#execinstr). The .file directive
! records the source file name for the object file.
33 .section ".text",#alloc,#execinstr
34 .file "bn_asm.sparc.v8.S"
! bn_mul_add_words: exported symbol; its C prototype is quoted just below.
! Control-flow skeleton formed by the labels and branches of this routine:
!   entry -> proceed -> (tail | warm_loop) ; loop <-> warm_loop ; tail -> return
38 .global bn_mul_add_words
40 * BN_ULONG bn_mul_add_words(rp,ap,num,w)
! bg,a is taken when the preceding compare left the signed greater-than
! condition; the ,a (annul) bit cancels the delay-slot instruction on the
! not-taken path.
! NOTE(review): presumably this is the num > 0 guard -- confirm against the
! instruction that sets the condition codes.
47 bg,a .L_bn_mul_add_words_proceed
52 .L_bn_mul_add_words_proceed:
! Zero result from the preceding flag-setting instruction: bypass the loop
! and go directly to the tail code.
54 bz .L_bn_mul_add_words_tail
! Unconditional jump into the middle of the loop body: warm_loop sits between
! the loop label and the loop back-edge, so the first pass skips the
! instructions at the top of the loop.
63 ba .L_bn_mul_add_words_warm_loop
66 .L_bn_mul_add_words_loop:
78 .L_bn_mul_add_words_warm_loop:
! Loop back-edge; with ,a the delay slot executes only when the branch is
! taken.
111 bnz,a .L_bn_mul_add_words_loop
! After the loop: enter the tail only on a non-zero result, otherwise
! continue towards the return label that follows.
115 bnz,a .L_bn_mul_add_words_tail
117 .L_bn_mul_add_words_return:
122 .L_bn_mul_add_words_tail:
! Two early exits from the tail, each taken on a zero result.
131 bz .L_bn_mul_add_words_return
144 bz .L_bn_mul_add_words_return
! ELF symbol metadata: mark the symbol as a function and set its size to
! (current location - start of bn_mul_add_words).
158 .type bn_mul_add_words,#function
159 .size bn_mul_add_words,(.-bn_mul_add_words)
! bn_mul_words: C prototype quoted on the next line.
! Control-flow skeleton formed by the labels and branches of this routine:
!   entry -> proceeed -> (tail | loop) ; loop repeats ; tail -> return
! NOTE(review): the local label is spelled proceeed (three e) at both its
! use and its definition, so it resolves; rename both together if tidying.
165 * BN_ULONG bn_mul_words(rp,ap,num,w)
! bg,a is taken when the preceding compare left the signed greater-than
! condition; ,a annuls the delay slot on the not-taken path.
! NOTE(review): presumably the num > 0 guard -- confirm against the compare.
172 bg,a .L_bn_mul_words_proceeed
177 .L_bn_mul_words_proceeed:
! Zero result from the preceding flag-setting instruction: bypass the loop
! and go directly to the tail code.
179 bz .L_bn_mul_words_tail
182 .L_bn_mul_words_loop:
! Loop back-edge; delay slot executes only when the branch is taken.
214 bnz,a .L_bn_mul_words_loop
! After the loop: enter the tail only on a non-zero result, otherwise
! continue towards the return label that follows.
218 bnz,a .L_bn_mul_words_tail
220 .L_bn_mul_words_return:
225 .L_bn_mul_words_tail:
! Two early exits from the tail, each taken on a zero result.
231 bz .L_bn_mul_words_return
241 bz .L_bn_mul_words_return
! ELF symbol metadata: function type; size = current location - start.
252 .type bn_mul_words,#function
253 .size bn_mul_words,(.-bn_mul_words)
! bn_sqr_words: C prototype quoted on the next line (returns void).
! Control-flow skeleton formed by the labels and branches of this routine:
!   entry -> proceeed -> (tail | loop) ; loop repeats ; tail -> return
! NOTE(review): the local label is spelled proceeed (three e) at both its
! use and its definition, so it resolves; rename both together if tidying.
258 * void bn_sqr_words(r,a,n)
! bg,a is taken when the preceding compare left the signed greater-than
! condition; ,a annuls the delay slot on the not-taken path.
! NOTE(review): presumably the n > 0 guard -- confirm against the compare.
264 bg,a .L_bn_sqr_words_proceeed
269 .L_bn_sqr_words_proceeed:
! Zero result from the preceding flag-setting instruction: bypass the loop
! and go directly to the tail code.
271 bz .L_bn_sqr_words_tail
274 .L_bn_sqr_words_loop:
! Loop back-edge; delay slot executes only when the branch is taken.
302 bnz,a .L_bn_sqr_words_loop
! After the loop: enter the tail only on a non-zero result, otherwise
! continue towards the return label that follows.
307 bnz,a .L_bn_sqr_words_tail
309 .L_bn_sqr_words_return:
313 .L_bn_sqr_words_tail:
! Two early exits from the tail, each taken on a zero result.
318 bz .L_bn_sqr_words_return
327 bz .L_bn_sqr_words_return
! ELF symbol metadata: function type; size = current location - start.
338 .type bn_sqr_words,#function
339 .size bn_sqr_words,(.-bn_sqr_words)
! bn_div_words: C prototype quoted on the next line; takes h, l and d and
! returns a BN_ULONG.
! NOTE(review): presumably divides the two-word value h:l by d -- confirm
! against the routine body.
345 * BN_ULONG bn_div_words(h,l,d)
! ELF symbol metadata: function type; size = current location - start.
354 .type bn_div_words,#function
355 .size bn_div_words,(.-bn_div_words)
! bn_add_words: C prototype quoted below; rp, ap and bp are all pointers to
! BN_ULONG (old-style parameter declaration on the second line).
! Control-flow skeleton formed by the labels and branches of this routine:
!   entry -> proceed -> (tail | warm_loop) ; loop <-> warm_loop ; tail -> return
361 * BN_ULONG bn_add_words(rp,ap,bp,n)
362 * BN_ULONG *rp,*ap,*bp;
! bg,a is taken when the preceding compare left the signed greater-than
! condition; ,a annuls the delay slot on the not-taken path.
! NOTE(review): presumably the n > 0 guard -- confirm against the compare.
367 bg,a .L_bn_add_words_proceed
372 .L_bn_add_words_proceed:
! Zero result from the preceding flag-setting instruction: bypass the loop
! and go directly to the tail code.
374 bz .L_bn_add_words_tail
! Unconditional jump into the middle of the loop body: warm_loop sits between
! the loop label and the loop back-edge, so the first pass skips the
! instructions at the top of the loop.
381 ba .L_bn_add_words_warm_loop
385 .L_bn_add_words_loop:
393 .L_bn_add_words_warm_loop:
! Loop back-edge; delay slot executes only when the branch is taken.
412 bnz,a .L_bn_add_words_loop
! After the loop: enter the tail only on a non-zero result, otherwise
! continue towards the return label that follows.
417 bnz,a .L_bn_add_words_tail
419 .L_bn_add_words_return:
423 .L_bn_add_words_tail:
! Two early exits from the tail, each taken on a zero result.
429 bz .L_bn_add_words_return
439 bz .L_bn_add_words_return
! ELF symbol metadata: function type; size = current location - start.
450 .type bn_add_words,#function
451 .size bn_add_words,(.-bn_add_words)
! bn_sub_words: C prototype quoted below; rp, ap and bp are all pointers to
! BN_ULONG (old-style parameter declaration on the second line).
! Control-flow skeleton formed by the labels and branches of this routine:
!   entry -> proceed -> (tail | warm_loop) ; loop <-> warm_loop ; tail -> return
457 * BN_ULONG bn_sub_words(rp,ap,bp,n)
458 * BN_ULONG *rp,*ap,*bp;
! bg,a is taken when the preceding compare left the signed greater-than
! condition; ,a annuls the delay slot on the not-taken path.
! NOTE(review): presumably the n > 0 guard -- confirm against the compare.
463 bg,a .L_bn_sub_words_proceed
468 .L_bn_sub_words_proceed:
! Zero result from the preceding flag-setting instruction: bypass the loop
! and go directly to the tail code.
470 bz .L_bn_sub_words_tail
! Unconditional jump into the middle of the loop body: warm_loop sits between
! the loop label and the loop back-edge, so the first pass skips the
! instructions at the top of the loop.
477 ba .L_bn_sub_words_warm_loop
481 .L_bn_sub_words_loop:
489 .L_bn_sub_words_warm_loop:
! Loop back-edge; delay slot executes only when the branch is taken.
508 bnz,a .L_bn_sub_words_loop
! After the loop: enter the tail only on a non-zero result, otherwise
! continue towards the return label that follows.
513 bnz,a .L_bn_sub_words_tail
515 .L_bn_sub_words_return:
519 .L_bn_sub_words_tail:
! Two early exits from the tail, each taken on a zero result.
525 bz .L_bn_sub_words_return
535 bz .L_bn_sub_words_return
! ELF symbol metadata: function type; size = current location - start.
546 .type bn_sub_words,#function
547 .size bn_sub_words,(.-bn_sub_words)
! FRAME_SIZE is the immediate handed to save %sp,FRAME_SIZE,%sp in the comba
! routines; save adds it to %sp, so the negative value reserves 96 bytes.
! NOTE(review): 96 looks like the minimum SPARC V8 stack frame -- confirm
! against the ABI before changing it.
549 #define FRAME_SIZE -96
552 * Here is register usage map for *all* routines below.
! Operand macros: each expands to the memory operand for element I at byte
! offset 4*I from a base register -- %i1 for a, %i2 for b, %i0 for r.
! NOTE(review): after save these registers presumably hold the a, b and r
! pointer arguments of the comba routines (r is the first parameter, then a,
! then b) -- confirm against the callers.
560 #define a(I) [%i1+4*I]
561 #define b(I) [%i2+4*I]
562 #define r(I) [%i0+4*I]
! bn_mul_comba8: product of two 8-word operands, r[0..15] = a[0..7] * b[0..7].
! No labels or branches appear in this routine: it is straight-line code.
! Every umul is tagged with the mul_add_c(a[i],b[j],lo,mid,hi) step it
! starts; all products with i+j == k are issued before r[k] is stored.
! The three accumulators rotate roles from one result word to the next:
!   (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) -> (c1,c2,c3) ...
! The addxcc lines add t_2 plus the carry flag into an accumulator; the form
! with %g0 as first source starts that accumulator afresh from t_2 + carry.
! NOTE(review): t_2 presumably holds the high product word taken from %y --
! confirm against the rd instructions of the routine.
583 .global bn_mul_comba8
585 * void bn_mul_comba8(r,a,b)
! New register window and stack frame; operands are addressed from here on
! through the %i-register based a/b/r operand macros.
589 save %sp,FRAME_SIZE,%sp
! Result word 0: the first product is written straight into c_1, so no add
! is needed before r[0] is stored.
592 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
595 st c_1,r(0) !r[0]=c1;
597 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
601 addxcc %g0,t_2,c_3 !=
604 umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
608 st c_2,r(1) !r[1]=c2;
611 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
614 addxcc c_1,t_2,c_1 !=
617 umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
623 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
626 addxcc c_1,t_2,c_1 !=
628 st c_3,r(2) !r[2]=c3;
630 umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
635 umul a_1,b_2,t_1 !=!mul_add_c(a[1],b[2],c1,c2,c3);
641 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
647 umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
652 st c_1,r(3) !r[3]=c1;
654 umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
659 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
664 umul a_2,b_2,t_1 !=!mul_add_c(a[2],b[2],c2,c3,c1);
670 umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
676 umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
681 st c_2,r(4) !r[4]=c2;
683 umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
688 umul a_1,b_4,t_1 !mul_add_c(a[1],b[4],c3,c1,c2);
693 umul a_2,b_3,t_1 !=!mul_add_c(a[2],b[3],c3,c1,c2);
698 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
701 addxcc c_1,t_2,c_1 !=
704 umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
710 umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
713 addxcc c_1,t_2,c_1 !=
715 st c_3,r(5) !r[5]=c3;
717 umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
722 umul a_5,b_1,t_1 !=!mul_add_c(a[5],b[1],c1,c2,c3);
727 umul a_4,b_2,t_1 !mul_add_c(a[4],b[2],c1,c2,c3);
730 addxcc c_2,t_2,c_2 !=
732 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
737 umul a_2,b_4,t_1 !mul_add_c(a[2],b[4],c1,c2,c3);
743 umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
746 addxcc c_2,t_2,c_2 !=
749 umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
753 st c_1,r(6) !r[6]=c1;
! Result word 7 is the widest: all eight products a[i]*b[7-i] feed it.
756 umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
759 addxcc c_3,t_2,c_3 !=
761 umul a_1,b_6,t_1 !mul_add_c(a[1],b[6],c2,c3,c1);
766 umul a_2,b_5,t_1 !mul_add_c(a[2],b[5],c2,c3,c1);
771 umul a_3,b_4,t_1 !=!mul_add_c(a[3],b[4],c2,c3,c1);
776 umul a_4,b_3,t_1 !mul_add_c(a[4],b[3],c2,c3,c1);
779 addxcc c_3,t_2,c_3 !=
781 umul a_5,b_2,t_1 !mul_add_c(a[5],b[2],c2,c3,c1);
787 umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
792 umul a_7,b_0,t_1 !mul_add_c(a[7],b[0],c2,c3,c1);
795 addxcc c_3,t_2,c_3 !=
797 st c_2,r(7) !r[7]=c2;
799 umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
804 umul a_6,b_2,t_1 !=!mul_add_c(a[6],b[2],c3,c1,c2);
809 umul a_5,b_3,t_1 !mul_add_c(a[5],b[3],c3,c1,c2);
812 addxcc c_1,t_2,c_1 !=
814 umul a_4,b_4,t_1 !mul_add_c(a[4],b[4],c3,c1,c2);
819 umul a_3,b_5,t_1 !mul_add_c(a[3],b[5],c3,c1,c2);
824 umul a_2,b_6,t_1 !=!mul_add_c(a[2],b[6],c3,c1,c2);
829 umul a_1,b_7,t_1 !mul_add_c(a[1],b[7],c3,c1,c2);
834 st c_3,r(8) !r[8]=c3;
836 umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
841 umul a_3,b_6,t_1 !=!mul_add_c(a[3],b[6],c1,c2,c3);
846 umul a_4,b_5,t_1 !mul_add_c(a[4],b[5],c1,c2,c3);
849 addxcc c_2,t_2,c_2 !=
851 umul a_5,b_4,t_1 !mul_add_c(a[5],b[4],c1,c2,c3);
856 umul a_6,b_3,t_1 !mul_add_c(a[6],b[3],c1,c2,c3);
861 umul a_7,b_2,t_1 !=!mul_add_c(a[7],b[2],c1,c2,c3);
866 st c_1,r(9) !r[9]=c1;
868 umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
873 umul a_6,b_4,t_1 !mul_add_c(a[6],b[4],c2,c3,c1);
878 umul a_5,b_5,t_1 !=!mul_add_c(a[5],b[5],c2,c3,c1);
883 umul a_4,b_6,t_1 !mul_add_c(a[4],b[6],c2,c3,c1);
886 addxcc c_3,t_2,c_3 !=
888 umul a_3,b_7,t_1 !mul_add_c(a[3],b[7],c2,c3,c1);
893 st c_2,r(10) !r[10]=c2;
895 umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
900 umul a_5,b_6,t_1 !mul_add_c(a[5],b[6],c3,c1,c2);
903 addxcc c_1,t_2,c_1 !=
905 umul a_6,b_5,t_1 !mul_add_c(a[6],b[5],c3,c1,c2);
910 umul a_7,b_4,t_1 !mul_add_c(a[7],b[4],c3,c1,c2);
914 st c_3,r(11) !r[11]=c3;
917 umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
920 addxcc c_2,t_2,c_2 !=
922 umul a_6,b_6,t_1 !mul_add_c(a[6],b[6],c1,c2,c3);
927 umul a_5,b_7,t_1 !mul_add_c(a[5],b[7],c1,c2,c3);
931 st c_1,r(12) !r[12]=c1;
934 umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
937 addxcc c_3,t_2,c_3 !=
939 umul a_7,b_6,t_1 !mul_add_c(a[7],b[6],c2,c3,c1);
944 st c_2,r(13) !r[13]=c2;
946 umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
! The last product feeds r[14]; r[15] has no product of its own and simply
! receives what was carried into c_1.
951 st c_3,r(14) !r[14]=c3;
952 st c_1,r(15) !r[15]=c1;
! ELF symbol metadata: function type; size = current location - start.
957 .type bn_mul_comba8,#function
958 .size bn_mul_comba8,(.-bn_mul_comba8)
! bn_mul_comba4: product of two 4-word operands, r[0..7] = a[0..3] * b[0..3].
! No labels or branches appear in this routine: it is straight-line code.
! Every umul is tagged with the mul_add_c(a[i],b[j],lo,mid,hi) step it
! starts; all products with i+j == k are issued before r[k] is stored, and
! the accumulators rotate (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) per word.
! addcc adds the product word t_1 into the current accumulator and sets the
! carry; addxcc adds t_2 plus that carry into the next accumulator.
! NOTE(review): t_2 presumably holds the high product word taken from %y --
! confirm against the rd instructions of the routine.
962 .global bn_mul_comba4
964 * void bn_mul_comba4(r,a,b)
! New register window and stack frame; operands are addressed from here on
! through the %i-register based a/b/r operand macros.
968 save %sp,FRAME_SIZE,%sp
! Result word 0: the first product is written straight into c_1, so no add
! is needed before r[0] is stored.
971 umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
974 st c_1,r(0) !r[0]=c1;
976 umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
983 umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
988 st c_2,r(1) !r[1]=c2;
990 umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
996 umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
1002 umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
1007 st c_3,r(2) !r[2]=c3;
! Result word 3 is the widest: all four products a[i]*b[3-i] feed it.
1009 umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
1014 umul a_1,b_2,t_1 !mul_add_c(a[1],b[2],c1,c2,c3);
1017 addxcc c_2,t_2,c_2 !=
1020 umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
1021 addcc c_1,t_1,c_1 !=
1025 umul a_3,b_0,t_1 !=!mul_add_c(a[3],b[0],c1,c2,c3);
1030 st c_1,r(3) !r[3]=c1;
1032 umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
1037 umul a_2,b_2,t_1 !mul_add_c(a[2],b[2],c2,c3,c1);
1038 addcc c_2,t_1,c_2 !=
1042 umul a_1,b_3,t_1 !=!mul_add_c(a[1],b[3],c2,c3,c1);
1047 st c_2,r(4) !r[4]=c2;
1049 umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
1054 umul a_3,b_2,t_1 !mul_add_c(a[3],b[2],c3,c1,c2);
1055 addcc c_3,t_1,c_3 !=
1058 st c_3,r(5) !r[5]=c3;
1061 umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
1064 addxcc c_2,t_2,c_2 !=
! The last product feeds r[6]; r[7] has no product of its own and simply
! receives what was carried into c_2.
1065 st c_1,r(6) !r[6]=c1;
1066 st c_2,r(7) !r[7]=c2;
! ELF symbol metadata: function type; size = current location - start.
1071 .type bn_mul_comba4,#function
1072 .size bn_mul_comba4,(.-bn_mul_comba4)
! bn_sqr_comba8: square of an 8-word operand, r[0..15] = a[0..7] * a[0..7].
! No labels or branches appear in this routine: it is straight-line code.
! Two kinds of step are tagged on the umul lines:
!   sqr_add_c(a,i,...)    - the diagonal product a[i]*a[i], added once;
!   sqr_add_c2(a,i,j,...) - the off-diagonal product a[i]*a[j], added twice
!                           (note the paired addcc c_x,t_1,c_x lines), since
!                           it occurs twice in the square when i differs
!                           from j.
! All steps with i+j == k are issued before r[k] is stored, and the three
! accumulators rotate (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) per result word.
! addcc adds the product word t_1 and sets the carry; addxcc adds t_2 plus
! that carry into the next accumulator.
! NOTE(review): t_2 presumably holds the high product word taken from %y --
! confirm against the rd instructions of the routine.
1076 .global bn_sqr_comba8
! New register window and stack frame; operands are addressed from here on
! through the %i-register based a/r operand macros.
1078 save %sp,FRAME_SIZE,%sp
! Result word 0: the first square is written straight into c_1, so no add
! is needed before r[0] is stored.
1081 umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
1083 st c_1,r(0) !r[0]=c1;
1086 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1093 st c_2,r(1) !r[1]=c2;
1096 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1099 addxcc c_1,t_2,c_1 !=
1105 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1110 st c_3,r(2) !r[2]=c3;
1112 umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
1121 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1124 addxcc c_2,t_2,c_2 !=
1129 st c_1,r(3) !r[3]=c1;
1131 umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
1137 addxcc c_3,t_2,c_3 !=
1139 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1145 addxcc c_3,t_2,c_3 !=
1148 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1149 addcc c_2,t_1,c_2 !=
1152 st c_2,r(4) !r[4]=c2;
1155 umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
1158 addxcc c_1,t_2,c_1 !=
1163 umul a_1,a_4,t_1 !sqr_add_c2(a,4,1,c3,c1,c2);
1166 addxcc c_1,t_2,c_1 !=
1172 umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
1178 addxcc c_1,t_2,c_1 !=
1180 st c_3,r(5) !r[5]=c3;
! From here the doubled accumulation of each off-diagonal product is
! visible as two addcc lines per sqr_add_c2 step.
1182 umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
1183 addcc c_1,t_1,c_1 !=
1187 addcc c_1,t_1,c_1 !=
1190 umul a_5,a_1,t_1 !sqr_add_c2(a,5,1,c1,c2,c3);
1191 addcc c_1,t_1,c_1 !=
1195 addcc c_1,t_1,c_1 !=
1198 umul a_4,a_2,t_1 !sqr_add_c2(a,4,2,c1,c2,c3);
1199 addcc c_1,t_1,c_1 !=
1203 addcc c_1,t_1,c_1 !=
1207 umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
1212 st c_1,r(6) !r[6]=c1;
1214 umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
1220 addxcc c_3,t_2,c_3 !=
1222 umul a_1,a_6,t_1 !sqr_add_c2(a,6,1,c2,c3,c1);
1228 addxcc c_3,t_2,c_3 !=
1230 umul a_2,a_5,t_1 !sqr_add_c2(a,5,2,c2,c3,c1);
1236 addxcc c_3,t_2,c_3 !=
1238 umul a_3,a_4,t_1 !sqr_add_c2(a,4,3,c2,c3,c1);
1244 addxcc c_3,t_2,c_3 !=
1246 st c_2,r(7) !r[7]=c2;
1248 umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
1249 addcc c_3,t_1,c_3 !=
1253 addcc c_3,t_1,c_3 !=
1256 umul a_6,a_2,t_1 !sqr_add_c2(a,6,2,c3,c1,c2);
1257 addcc c_3,t_1,c_3 !=
1261 addcc c_3,t_1,c_3 !=
1264 umul a_5,a_3,t_1 !sqr_add_c2(a,5,3,c3,c1,c2);
1265 addcc c_3,t_1,c_3 !=
1269 addcc c_3,t_1,c_3 !=
1272 umul a_4,a_4,t_1 !sqr_add_c(a,4,c3,c1,c2);
1273 addcc c_3,t_1,c_3 !=
1276 st c_3,r(8) !r[8]=c3;
1279 umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
1282 addxcc c_2,t_2,c_2 !=
1287 umul a_3,a_6,t_1 !sqr_add_c2(a,6,3,c1,c2,c3);
1290 addxcc c_2,t_2,c_2 !=
1295 umul a_4,a_5,t_1 !sqr_add_c2(a,5,4,c1,c2,c3);
1298 addxcc c_2,t_2,c_2 !=
1303 st c_1,r(9) !r[9]=c1;
1305 umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
1311 addxcc c_3,t_2,c_3 !=
1313 umul a_6,a_4,t_1 !sqr_add_c2(a,6,4,c2,c3,c1);
1319 addxcc c_3,t_2,c_3 !=
1321 umul a_5,a_5,t_1 !sqr_add_c(a,5,c2,c3,c1);
1326 st c_2,r(10) !r[10]=c2;
1328 umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
1336 umul a_5,a_6,t_1 !=!sqr_add_c2(a,6,5,c3,c1,c2);
1343 st c_3,r(11) !r[11]=c3;
1346 umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
1349 addxcc c_2,t_2,c_2 !=
1354 umul a_6,a_6,t_1 !sqr_add_c(a,6,c1,c2,c3);
1357 addxcc c_2,t_2,c_2 !=
1359 st c_1,r(12) !r[12]=c1;
1361 umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
1362 addcc c_2,t_1,c_2 !=
1366 addcc c_2,t_1,c_2 !=
1369 st c_2,r(13) !r[13]=c2;
1372 umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
1375 addxcc c_1,t_2,c_1 !=
! The last square feeds r[14]; r[15] has no product of its own and simply
! receives what was carried into c_1.
1376 st c_3,r(14) !r[14]=c3;
1377 st c_1,r(15) !r[15]=c1;
! ELF symbol metadata: function type; size = current location - start.
1382 .type bn_sqr_comba8,#function
1383 .size bn_sqr_comba8,(.-bn_sqr_comba8)
! bn_sqr_comba4: square of a 4-word operand, r[0..7] = a[0..3] * a[0..3].
! No labels or branches appear in this routine: it is straight-line code.
! Two kinds of step are tagged on the umul lines:
!   sqr_add_c(a,i,...)    - the diagonal product a[i]*a[i];
!   sqr_add_c2(a,i,j,...) - the off-diagonal product a[i]*a[j].
! NOTE(review): sqr_add_c2 presumably accumulates its product twice, since
! a[i]*a[j] occurs twice in the square when i differs from j -- confirm
! against the add instructions of the routine.
! All steps with i+j == k are issued before r[k] is stored, and the three
! accumulators rotate (c1,c2,c3) -> (c2,c3,c1) -> (c3,c1,c2) per result word.
! addcc adds the product word t_1 and sets the carry; addxcc adds t_2 plus
! that carry into the next accumulator.
! NOTE(review): t_2 presumably holds the high product word taken from %y --
! confirm against the rd instructions of the routine.
1387 .global bn_sqr_comba4
1389 * void bn_sqr_comba4(r,a)
! New register window and stack frame; operands are addressed from here on
! through the %i-register based a/r operand macros.
1393 save %sp,FRAME_SIZE,%sp
! Result word 0: the first square is written straight into c_1, so no add
! is needed before r[0] is stored.
1395 umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
1398 st c_1,r(0) !r[0]=c1;
1401 umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
1410 st c_2,r(1) !r[1]=c2;
1412 umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
1418 addxcc c_1,t_2,c_1 !=
1421 umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
1422 addcc c_3,t_1,c_3 !=
1425 st c_3,r(2) !r[2]=c3;
1428 umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
1431 addxcc c_2,t_2,c_2 !=
1436 umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
1439 addxcc c_2,t_2,c_2 !=
1444 st c_1,r(3) !r[3]=c1;
1446 umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
1452 addxcc c_3,t_2,c_3 !=
1454 umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
1459 st c_2,r(4) !r[4]=c2;
1461 umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
1468 st c_3,r(5) !r[5]=c3;
1471 umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
1474 addxcc c_2,t_2,c_2 !=
! The last square feeds r[6]; r[7] has no product of its own and simply
! receives what was carried into c_2.
1475 st c_1,r(6) !r[6]=c1;
1476 st c_2,r(7) !r[7]=c2;
! ELF symbol metadata: function type; size = current location - start.
1481 .type bn_sqr_comba4,#function
1482 .size bn_sqr_comba4,(.-bn_sqr_comba4)