2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
14 # Rights for redistribution and usage in source and binary forms are
15 # granted according to the OpenSSL license. Warranty of any kind is
17 # ====================================================================
22 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
24 # The module is designed to work with either of the "new" MIPS ABI(5),
25 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
26 # IRIX 5.x not only because it doesn't support new ABIs but also
27 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
28 # 64-bit instructions (daddu, dmultu, etc.) found below will only
29 # cause an illegal instruction exception:-(
31 # In addition the code depends on preprocessor flags set up by MIPSpro
32 # compiler driver (either as or cc) and therefore (probably?) can't be
33 # compiled by the GNU assembler. GNU C driver manages fine though...
34 # I mean as long as -mmips-as is specified or is the default option,
35 # because then it simply invokes /usr/bin/as which in turn takes
36 # perfect care of the preprocessor definitions. Another neat feature
37 # offered by the MIPSpro assembler is an optimization pass. This gave
38 # me the opportunity to have the code looking more regular as all those
39 # architecture dependent instruction rescheduling details were left to
40 # the assembler. Cool, huh?
42 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
43 # goes way over 3 times faster!
45 # <appro@fy.chalmers.se>
49 # Adapt the module even for 32-bit ABIs and other OSes. The former was
50 # achieved by mechanical replacement of 64-bit arithmetic instructions
51 # such as dmultu, daddu, etc. with their 32-bit counterparts and
52 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
53 # >3x performance improvement naturally does not apply to 32-bit code
54 # [because there is no instruction 32-bit compiler can't use], one
55 # has to be content with 40-85% improvement depending on benchmark and
56 # key length, more for longer keys.
58 $flavour = shift || "o32";
59 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
60 open STDOUT,">$output";
62 if ($flavour =~ /64|n32/i) {
95 # Below is N32/64 register layout used in the original module.
97 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
98 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
99 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
100 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
101 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
102 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
104 # No special adaptation is required for O32. NUBI on the other hand
105 # is treated by saving/restoring ($v1,$t0..$t3).
107 $gp=$v1 if ($flavour =~ /nubi/i);
113 .asciiz "mips3.s, Version 1.2"
114 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
120 .globl bn_mul_add_words
121 .ent bn_mul_add_words
124 bgtz $a2,bn_mul_add_words_internal
128 .end bn_mul_add_words
131 .ent bn_mul_add_words_internal
132 bn_mul_add_words_internal:
134 $code.=<<___ if ($flavour =~ /nubi/i);
135 .frame $sp,6*$SZREG,$ra
136 .mask 0x8000f008,-$SZREG
138 $PTR_SUB $sp,6*$SZREG
139 $REG_S $ra,5*$SZREG($sp)
140 $REG_S $t3,4*$SZREG($sp)
141 $REG_S $t2,3*$SZREG($sp)
142 $REG_S $t1,2*$SZREG($sp)
143 $REG_S $t0,1*$SZREG($sp)
144 $REG_S $gp,0*$SZREG($sp)
150 beqz $ta0,.L_bn_mul_add_words_tail
152 .L_bn_mul_add_words_loop:
158 $LD $ta0,2*$BNSZ($a1)
159 $LD $ta1,2*$BNSZ($a0)
161 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
162 # values", but it seems to work fine
163 # even on 64-bit registers.
173 $LD $ta2,3*$BNSZ($a1)
174 $LD $ta3,3*$BNSZ($a0)
197 $ST $ta1,-2*$BNSZ($a0)
211 bgtz $ta0,.L_bn_mul_add_words_loop
214 beqz $a2,.L_bn_mul_add_words_return
217 .L_bn_mul_add_words_tail:
232 beqz $a2,.L_bn_mul_add_words_return
247 beqz $a2,.L_bn_mul_add_words_return
262 .L_bn_mul_add_words_return:
265 $code.=<<___ if ($flavour =~ /nubi/i);
266 $REG_L $t3,4*$SZREG($sp)
267 $REG_L $t2,3*$SZREG($sp)
268 $REG_L $t1,2*$SZREG($sp)
269 $REG_L $t0,1*$SZREG($sp)
270 $REG_L $gp,0*$SZREG($sp)
271 $PTR_ADD $sp,6*$SZREG
276 .end bn_mul_add_words_internal
283 bgtz $a2,bn_mul_words_internal
290 .ent bn_mul_words_internal
291 bn_mul_words_internal:
293 $code.=<<___ if ($flavour =~ /nubi/i);
294 .frame $sp,6*$SZREG,$ra
295 .mask 0x8000f008,-$SZREG
297 $PTR_SUB $sp,6*$SZREG
298 $REG_S $ra,5*$SZREG($sp)
299 $REG_S $t3,4*$SZREG($sp)
300 $REG_S $t2,3*$SZREG($sp)
301 $REG_S $t1,2*$SZREG($sp)
302 $REG_S $t0,1*$SZREG($sp)
303 $REG_S $gp,0*$SZREG($sp)
309 beqz $ta0,.L_bn_mul_words_tail
311 .L_bn_mul_words_loop:
315 $LD $ta0,2*$BNSZ($a1)
316 $LD $ta2,3*$BNSZ($a1)
333 $ST $v0,-3*$BNSZ($a0)
341 $ST $v0,-2*$BNSZ($a0)
351 bgtz $ta0,.L_bn_mul_words_loop
354 beqz $a2,.L_bn_mul_words_return
357 .L_bn_mul_words_tail:
368 beqz $a2,.L_bn_mul_words_return
379 beqz $a2,.L_bn_mul_words_return
390 .L_bn_mul_words_return:
393 $code.=<<___ if ($flavour =~ /nubi/i);
394 $REG_L $t3,4*$SZREG($sp)
395 $REG_L $t2,3*$SZREG($sp)
396 $REG_L $t1,2*$SZREG($sp)
397 $REG_L $t0,1*$SZREG($sp)
398 $REG_L $gp,0*$SZREG($sp)
399 $PTR_ADD $sp,6*$SZREG
404 .end bn_mul_words_internal
411 bgtz $a2,bn_sqr_words_internal
418 .ent bn_sqr_words_internal
419 bn_sqr_words_internal:
421 $code.=<<___ if ($flavour =~ /nubi/i);
422 .frame $sp,6*$SZREG,$ra
423 .mask 0x8000f008,-$SZREG
425 $PTR_SUB $sp,6*$SZREG
426 $REG_S $ra,5*$SZREG($sp)
427 $REG_S $t3,4*$SZREG($sp)
428 $REG_S $t2,3*$SZREG($sp)
429 $REG_S $t1,2*$SZREG($sp)
430 $REG_S $t0,1*$SZREG($sp)
431 $REG_S $gp,0*$SZREG($sp)
437 beqz $ta0,.L_bn_sqr_words_tail
439 .L_bn_sqr_words_loop:
443 $LD $ta0,2*$BNSZ($a1)
444 $LD $ta2,3*$BNSZ($a1)
456 $ST $t3,-6*$BNSZ($a0)
457 $ST $t2,-5*$BNSZ($a0)
462 $ST $ta1,-4*$BNSZ($a0)
463 $ST $ta0,-3*$BNSZ($a0)
470 $ST $ta3,-2*$BNSZ($a0)
473 bgtz $ta0,.L_bn_sqr_words_loop
476 beqz $a2,.L_bn_sqr_words_return
479 .L_bn_sqr_words_tail:
488 beqz $a2,.L_bn_sqr_words_return
497 beqz $a2,.L_bn_sqr_words_return
506 .L_bn_sqr_words_return:
509 $code.=<<___ if ($flavour =~ /nubi/i);
510 $REG_L $t3,4*$SZREG($sp)
511 $REG_L $t2,3*$SZREG($sp)
512 $REG_L $t1,2*$SZREG($sp)
513 $REG_L $t0,1*$SZREG($sp)
514 $REG_L $gp,0*$SZREG($sp)
515 $PTR_ADD $sp,6*$SZREG
521 .end bn_sqr_words_internal
528 bgtz $a3,bn_add_words_internal
535 .ent bn_add_words_internal
536 bn_add_words_internal:
538 $code.=<<___ if ($flavour =~ /nubi/i);
539 .frame $sp,6*$SZREG,$ra
540 .mask 0x8000f008,-$SZREG
542 $PTR_SUB $sp,6*$SZREG
543 $REG_S $ra,5*$SZREG($sp)
544 $REG_S $t3,4*$SZREG($sp)
545 $REG_S $t2,3*$SZREG($sp)
546 $REG_S $t1,2*$SZREG($sp)
547 $REG_S $t0,1*$SZREG($sp)
548 $REG_S $gp,0*$SZREG($sp)
554 beqz $at,.L_bn_add_words_tail
556 .L_bn_add_words_loop:
566 $LD $ta1,-3*$BNSZ($a2)
568 $LD $ta2,-2*$BNSZ($a2)
574 $ST $t0,-4*$BNSZ($a0)
581 $ST $t1,-3*$BNSZ($a0)
588 $ST $t2,-2*$BNSZ($a0)
598 bgtz $at,.L_bn_add_words_loop
601 beqz $a3,.L_bn_add_words_return
604 .L_bn_add_words_tail:
615 beqz $a3,.L_bn_add_words_return
626 beqz $a3,.L_bn_add_words_return
629 $LD $ta2,2*$BNSZ($a2)
637 .L_bn_add_words_return:
640 $code.=<<___ if ($flavour =~ /nubi/i);
641 $REG_L $t3,4*$SZREG($sp)
642 $REG_L $t2,3*$SZREG($sp)
643 $REG_L $t1,2*$SZREG($sp)
644 $REG_L $t0,1*$SZREG($sp)
645 $REG_L $gp,0*$SZREG($sp)
646 $PTR_ADD $sp,6*$SZREG
652 .end bn_add_words_internal
659 bgtz $a3,bn_sub_words_internal
666 .ent bn_sub_words_internal
667 bn_sub_words_internal:
669 $code.=<<___ if ($flavour =~ /nubi/i);
670 .frame $sp,6*$SZREG,$ra
671 .mask 0x8000f008,-$SZREG
673 $PTR_SUB $sp,6*$SZREG
674 $REG_S $ra,5*$SZREG($sp)
675 $REG_S $t3,4*$SZREG($sp)
676 $REG_S $t2,3*$SZREG($sp)
677 $REG_S $t1,2*$SZREG($sp)
678 $REG_S $t0,1*$SZREG($sp)
679 $REG_S $gp,0*$SZREG($sp)
685 beqz $at,.L_bn_sub_words_tail
687 .L_bn_sub_words_loop:
697 $LD $ta1,-3*$BNSZ($a2)
699 $LD $ta2,-2*$BNSZ($a2)
705 $ST $t0,-4*$BNSZ($a0)
712 $ST $t1,-3*$BNSZ($a0)
720 $ST $t2,-2*$BNSZ($a0)
730 bgtz $at,.L_bn_sub_words_loop
733 beqz $a3,.L_bn_sub_words_return
736 .L_bn_sub_words_tail:
747 beqz $a3,.L_bn_sub_words_return
758 beqz $a3,.L_bn_sub_words_return
761 $LD $ta2,2*$BNSZ($a2)
769 .L_bn_sub_words_return:
772 $code.=<<___ if ($flavour =~ /nubi/i);
773 $REG_L $t3,4*$SZREG($sp)
774 $REG_L $t2,3*$SZREG($sp)
775 $REG_L $t1,2*$SZREG($sp)
776 $REG_L $t0,1*$SZREG($sp)
777 $REG_L $gp,0*$SZREG($sp)
778 $PTR_ADD $sp,6*$SZREG
783 .end bn_sub_words_internal
786 .globl bn_div_3_words
790 move $a3,$a0 # we know that bn_div_words does not
791 # touch $a3, $ta2, $ta3 and preserves $a2
792 # so that we can save two arguments
793 # and return address in registers
794 # instead of stack:-)
798 bne $a0,$a2,bn_div_3_words_internal
806 .ent bn_div_3_words_internal
807 bn_div_3_words_internal:
809 $code.=<<___ if ($flavour =~ /nubi/i);
810 .frame $sp,6*$SZREG,$ra
811 .mask 0x8000f008,-$SZREG
813 $PTR_SUB $sp,6*$SZREG
814 $REG_S $ra,5*$SZREG($sp)
815 $REG_S $t3,4*$SZREG($sp)
816 $REG_S $t2,3*$SZREG($sp)
817 $REG_S $t1,2*$SZREG($sp)
818 $REG_S $t0,1*$SZREG($sp)
819 $REG_S $gp,0*$SZREG($sp)
824 bal bn_div_words_internal
827 $LD $t2,-2*$BNSZ($a3)
832 .L_bn_div_3_words_inner_loop:
833 bnez $t8,.L_bn_div_3_words_inner_loop_done
845 beqz $at,.L_bn_div_3_words_inner_loop
849 .L_bn_div_3_words_inner_loop_done:
852 $code.=<<___ if ($flavour =~ /nubi/i);
853 $REG_L $t3,4*$SZREG($sp)
854 $REG_L $t2,3*$SZREG($sp)
855 $REG_L $t1,2*$SZREG($sp)
856 $REG_L $t0,1*$SZREG($sp)
857 $REG_L $gp,0*$SZREG($sp)
858 $PTR_ADD $sp,6*$SZREG
863 .end bn_div_3_words_internal
870 bnez $a2,bn_div_words_internal
871 li $v0,-1 # I would rather signal div-by-zero
872 # which can be done with 'break 7'
878 .ent bn_div_words_internal
879 bn_div_words_internal:
881 $code.=<<___ if ($flavour =~ /nubi/i);
882 .frame $sp,6*$SZREG,$ra
883 .mask 0x8000f008,-$SZREG
885 $PTR_SUB $sp,6*$SZREG
886 $REG_S $ra,5*$SZREG($sp)
887 $REG_S $t3,4*$SZREG($sp)
888 $REG_S $t2,3*$SZREG($sp)
889 $REG_S $t1,2*$SZREG($sp)
890 $REG_S $t0,1*$SZREG($sp)
891 $REG_S $gp,0*$SZREG($sp)
895 bltz $a2,.L_bn_div_words_body
910 break 6 # signal overflow
920 .L_bn_div_words_body:
921 $SRL $DH,$a2,4*$BNSZ # bits
930 $SRL $HH,$a0,4*$BNSZ # bits
931 $SRL $QT,4*$BNSZ # q=0xffffffff
932 beq $DH,$HH,.L_bn_div_words_skip_div1
935 .L_bn_div_words_skip_div1:
937 $SLL $t3,$a0,4*$BNSZ # bits
938 $SRL $at,$a1,4*$BNSZ # bits
942 .L_bn_div_words_inner_loop1:
950 beqz $at,.L_bn_div_words_inner_loop1_done
953 b .L_bn_div_words_inner_loop1
956 .L_bn_div_words_inner_loop1_done:
958 $SLL $a1,4*$BNSZ # bits
960 $SLL $v0,$QT,4*$BNSZ # bits
963 $SRL $HH,$a0,4*$BNSZ # bits
964 $SRL $QT,4*$BNSZ # q=0xffffffff
965 beq $DH,$HH,.L_bn_div_words_skip_div2
968 .L_bn_div_words_skip_div2:
970 $SLL $t3,$a0,4*$BNSZ # bits
971 $SRL $at,$a1,4*$BNSZ # bits
975 .L_bn_div_words_inner_loop2:
983 beqz $at,.L_bn_div_words_inner_loop2_done
986 b .L_bn_div_words_inner_loop2
989 .L_bn_div_words_inner_loop2_done:
993 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
994 $SRL $a2,$t9 # restore $a2
999 $code.=<<___ if ($flavour =~ /nubi/i);
1000 $REG_L $t3,4*$SZREG($sp)
1001 $REG_L $t2,3*$SZREG($sp)
1002 $REG_L $t1,2*$SZREG($sp)
1003 $REG_L $t0,1*$SZREG($sp)
1004 $REG_L $gp,0*$SZREG($sp)
1005 $PTR_ADD $sp,6*$SZREG
1010 .end bn_div_words_internal
1012 undef $HH; undef $QT; undef $DH;
1014 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1015 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1017 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1018 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1020 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1025 .globl bn_mul_comba8
1030 $code.=<<___ if ($flavour =~ /nubi/i);
1031 .frame $sp,12*$SZREG,$ra
1032 .mask 0x803ff008,-$SZREG
1033 $PTR_SUB $sp,12*$SZREG
1034 $REG_S $ra,11*$SZREG($sp)
1035 $REG_S $s5,10*$SZREG($sp)
1036 $REG_S $s4,9*$SZREG($sp)
1037 $REG_S $s3,8*$SZREG($sp)
1038 $REG_S $s2,7*$SZREG($sp)
1039 $REG_S $s1,6*$SZREG($sp)
1040 $REG_S $s0,5*$SZREG($sp)
1041 $REG_S $t3,4*$SZREG($sp)
1042 $REG_S $t2,3*$SZREG($sp)
1043 $REG_S $t1,2*$SZREG($sp)
1044 $REG_S $t0,1*$SZREG($sp)
1045 $REG_S $gp,0*$SZREG($sp)
1047 $code.=<<___ if ($flavour !~ /nubi/i);
1048 .frame $sp,6*$SZREG,$ra
1049 .mask 0x003f0000,-$SZREG
1050 $PTR_SUB $sp,6*$SZREG
1051 $REG_S $s5,5*$SZREG($sp)
1052 $REG_S $s4,4*$SZREG($sp)
1053 $REG_S $s3,3*$SZREG($sp)
1054 $REG_S $s2,2*$SZREG($sp)
1055 $REG_S $s1,1*$SZREG($sp)
1056 $REG_S $s0,0*$SZREG($sp)
1061 $LD $a_0,0($a1) # If compiled with -mips3 option on
1062 # R5000 box assembler barks on this
1063 # line with "should not have mult/div
1064 # as last instruction in bb (R10K
1065 # bug)" warning. If anybody out there
1066 # has a clue about how to circumvent
1067 # this do send me a note.
1068 # <appro\@fy.chalmers.se>
1072 $LD $a_2,2*$BNSZ($a1)
1073 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1074 $LD $a_3,3*$BNSZ($a1)
1076 $LD $b_2,2*$BNSZ($a2)
1077 $LD $b_3,3*$BNSZ($a2)
1081 $LD $a_4,4*$BNSZ($a1)
1082 $LD $a_5,5*$BNSZ($a1)
1083 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1084 $LD $a_6,6*$BNSZ($a1)
1085 $LD $a_7,7*$BNSZ($a1)
1086 $LD $b_4,4*$BNSZ($a2)
1087 $LD $b_5,5*$BNSZ($a2)
1092 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1094 $LD $b_6,6*$BNSZ($a2)
1095 $LD $b_7,7*$BNSZ($a2)
1096 $ST $c_1,0($a0) # r[0]=c1;
1101 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1105 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1111 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1118 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1126 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1131 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1137 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1145 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1154 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1163 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1);
1168 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1174 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1182 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1191 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1200 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1);
1209 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2);
1214 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1220 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2);
1228 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1237 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1246 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2);
1255 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2);
1264 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3);
1269 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1275 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3);
1283 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3);
1292 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1301 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3);
1310 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3);
1319 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3);
1328 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1);
1333 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1339 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1);
1347 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1);
1356 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1);
1365 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1);
1374 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1);
1383 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1);
1392 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1);
1401 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2);
1406 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1412 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2);
1420 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2);
1429 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2);
1438 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2);
1447 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2);
1456 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2);
1465 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3);
1470 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1476 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3);
1484 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3);
1493 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3);
1502 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3);
1511 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3);
1520 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1);
1525 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1531 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1);
1539 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1);
1548 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1);
1557 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1);
1566 $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2);
1571 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1577 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2);
1585 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2);
1594 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2);
1603 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3);
1608 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1614 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3);
1622 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3);
1631 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1);
1636 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1642 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1);
1650 $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2);
1655 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1663 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1664 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1668 $code.=<<___ if ($flavour =~ /nubi/i);
1669 $REG_L $s5,10*$SZREG($sp)
1670 $REG_L $s4,9*$SZREG($sp)
1671 $REG_L $s3,8*$SZREG($sp)
1672 $REG_L $s2,7*$SZREG($sp)
1673 $REG_L $s1,6*$SZREG($sp)
1674 $REG_L $s0,5*$SZREG($sp)
1675 $REG_L $t3,4*$SZREG($sp)
1676 $REG_L $t2,3*$SZREG($sp)
1677 $REG_L $t1,2*$SZREG($sp)
1678 $REG_L $t0,1*$SZREG($sp)
1679 $REG_L $gp,0*$SZREG($sp)
1681 $PTR_ADD $sp,12*$SZREG
1683 $code.=<<___ if ($flavour !~ /nubi/i);
1684 $REG_L $s5,5*$SZREG($sp)
1685 $REG_L $s4,4*$SZREG($sp)
1686 $REG_L $s3,3*$SZREG($sp)
1687 $REG_L $s2,2*$SZREG($sp)
1688 $REG_L $s1,1*$SZREG($sp)
1689 $REG_L $s0,0*$SZREG($sp)
1691 $PTR_ADD $sp,6*$SZREG
1697 .globl bn_mul_comba4
1701 $code.=<<___ if ($flavour =~ /nubi/i);
1702 .frame $sp,6*$SZREG,$ra
1703 .mask 0x8000f008,-$SZREG
1705 $PTR_SUB $sp,6*$SZREG
1706 $REG_S $ra,5*$SZREG($sp)
1707 $REG_S $t3,4*$SZREG($sp)
1708 $REG_S $t2,3*$SZREG($sp)
1709 $REG_S $t1,2*$SZREG($sp)
1710 $REG_S $t0,1*$SZREG($sp)
1711 $REG_S $gp,0*$SZREG($sp)
1718 $LD $a_2,2*$BNSZ($a1)
1719 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1720 $LD $a_3,3*$BNSZ($a1)
1722 $LD $b_2,2*$BNSZ($a2)
1723 $LD $b_3,3*$BNSZ($a2)
1728 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1);
1733 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1);
1739 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2);
1749 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2);
1756 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2);
1764 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3);
1769 $ST $c_3,2*$BNSZ($a0)
1775 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3);
1783 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3);
1792 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3);
1801 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1);
1806 $ST $c_1,3*$BNSZ($a0)
1812 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1);
1820 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1);
1829 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2);
1834 $ST $c_2,4*$BNSZ($a0)
1840 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2);
1848 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3);
1853 $ST $c_3,5*$BNSZ($a0)
1861 $ST $c_1,6*$BNSZ($a0)
1862 $ST $c_2,7*$BNSZ($a0)
1866 $code.=<<___ if ($flavour =~ /nubi/i);
1867 $REG_L $t3,4*$SZREG($sp)
1868 $REG_L $t2,3*$SZREG($sp)
1869 $REG_L $t1,2*$SZREG($sp)
1870 $REG_L $t0,1*$SZREG($sp)
1871 $REG_L $gp,0*$SZREG($sp)
1872 $PTR_ADD $sp,6*$SZREG
1880 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1883 my ($hi,$lo,$c0,$c1,$c2,
1884 $warm, # !$warm denotes first call with specific sequence of
1885 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1886 $an,$bn # these two are arguments for multiplication whose
1887 # result is used in *next* step [which is why it's
1888 # commented as "forward multiplication" below];
1895 $MULTU $an,$bn # forward multiplication
1902 $code.=<<___ if (!$warm);
1908 $code.=<<___ if ($warm);
1920 .globl bn_sqr_comba8
1924 $code.=<<___ if ($flavour =~ /nubi/i);
1925 .frame $sp,6*$SZREG,$ra
1926 .mask 0x8000f008,-$SZREG
1928 $PTR_SUB $sp,6*$SZREG
1929 $REG_S $ra,5*$SZREG($sp)
1930 $REG_S $t3,4*$SZREG($sp)
1931 $REG_S $t2,3*$SZREG($sp)
1932 $REG_S $t1,2*$SZREG($sp)
1933 $REG_S $t0,1*$SZREG($sp)
1934 $REG_S $gp,0*$SZREG($sp)
1940 $LD $a_2,2*$BNSZ($a1)
1941 $LD $a_3,3*$BNSZ($a1)
1943 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
1944 $LD $a_4,4*$BNSZ($a1)
1945 $LD $a_5,5*$BNSZ($a1)
1946 $LD $a_6,6*$BNSZ($a1)
1947 $LD $a_7,7*$BNSZ($a1)
1952 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
1957 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
1966 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1967 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1973 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
1978 $ST $c_3,2*$BNSZ($a0)
1980 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1981 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
1982 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
1983 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
1985 $ST $c_1,3*$BNSZ($a0)
1987 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
1988 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
1989 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
1990 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
1996 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2);
2001 $ST $c_2,4*$BNSZ($a0)
2003 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2004 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2005 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2006 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2007 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2008 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2010 $ST $c_3,5*$BNSZ($a0)
2012 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2013 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2014 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2015 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2016 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2017 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2023 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1);
2028 $ST $c_1,6*$BNSZ($a0)
2030 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2031 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2032 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2033 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2034 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2035 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2036 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2037 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2039 $ST $c_2,7*$BNSZ($a0)
2041 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2042 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2043 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2044 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2045 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2046 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2052 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3);
2057 $ST $c_3,8*$BNSZ($a0)
2059 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2060 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2061 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2062 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2063 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2064 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2066 $ST $c_1,9*$BNSZ($a0)
2068 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2069 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2070 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2071 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2077 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2);
2082 $ST $c_2,10*$BNSZ($a0)
2084 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2085 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2086 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2087 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2089 $ST $c_3,11*$BNSZ($a0)
2091 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2092 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2098 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1);
2103 $ST $c_1,12*$BNSZ($a0)
2105 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2106 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2108 $ST $c_2,13*$BNSZ($a0)
2116 $ST $c_3,14*$BNSZ($a0)
2117 $ST $c_1,15*$BNSZ($a0)
2121 $code.=<<___ if ($flavour =~ /nubi/i);
2122 $REG_L $t3,4*$SZREG($sp)
2123 $REG_L $t2,3*$SZREG($sp)
2124 $REG_L $t1,2*$SZREG($sp)
2125 $REG_L $t0,1*$SZREG($sp)
2126 $REG_L $gp,0*$SZREG($sp)
2127 $PTR_ADD $sp,6*$SZREG
2135 .globl bn_sqr_comba4
2139 $code.=<<___ if ($flavour =~ /nubi/i);
2140 .frame $sp,6*$SZREG,$ra
2141 .mask 0x8000f008,-$SZREG
2143 $PTR_SUB $sp,6*$SZREG
2144 $REG_S $ra,5*$SZREG($sp)
2145 $REG_S $t3,4*$SZREG($sp)
2146 $REG_S $t2,3*$SZREG($sp)
2147 $REG_S $t1,2*$SZREG($sp)
2148 $REG_S $t0,1*$SZREG($sp)
2149 $REG_S $gp,0*$SZREG($sp)
2155 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3);
2156 $LD $a_2,2*$BNSZ($a1)
2157 $LD $a_3,3*$BNSZ($a1)
2162 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1);
2167 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2);
2176 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2177 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2183 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3);
2188 $ST $c_3,2*$BNSZ($a0)
2190 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2191 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2192 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2193 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2195 $ST $c_1,3*$BNSZ($a0)
2197 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2198 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2204 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
2209 $ST $c_2,4*$BNSZ($a0)
2211 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2212 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2214 $ST $c_3,5*$BNSZ($a0)
2222 $ST $c_1,6*$BNSZ($a0)
2223 $ST $c_2,7*$BNSZ($a0)
2227 $code.=<<___ if ($flavour =~ /nubi/i);
2228 $REG_L $t3,4*$SZREG($sp)
2229 $REG_L $t2,3*$SZREG($sp)
2230 $REG_L $t1,2*$SZREG($sp)
2231 $REG_L $t0,1*$SZREG($sp)
2232 $REG_L $gp,0*$SZREG($sp)
2233 $PTR_ADD $sp,6*$SZREG