2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # Rights for redistribution and usage in source and binary forms are
15 # granted according to the License. Warranty of any kind is disclaimed.
16 # ====================================================================
21 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
23 # The module is designed to work with either of the "new" MIPS ABI(5),
24 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
25 # IRIX 5.x not only because it doesn't support new ABIs but also
26 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
27 # 64-bit instructions (daddu, dmultu, etc.) found below gonna only
28 # cause illegal instruction exception:-(
30 # In addition the code depends on preprocessor flags set up by MIPSpro
31 # compiler driver (either as or cc) and therefore (probably?) can't be
32 # compiled by the GNU assembler. GNU C driver manages fine though...
33 # I mean as long as -mmips-as is specified or is the default option,
34 # because then it simply invokes /usr/bin/as which in turn takes
35 # perfect care of the preprocessor definitions. Another neat feature
36 # offered by the MIPSpro assembler is an optimization pass. This gave
37 # me the opportunity to have the code looking more regular as all those
38 # architecture dependent instruction rescheduling details were left to
39 # the assembler. Cool, huh?
41 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
42 # goes way over 3 times faster!
48 # Adapt the module even for 32-bit ABIs and other OSes. The former was
49 # achieved by mechanical replacement of 64-bit arithmetic instructions
50 # such as dmultu, daddu, etc. with their 32-bit counterparts and
51 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
52 # >3x performance improvement naturally does not apply to 32-bit code
53 # [because there is no instruction 32-bit compiler can't use], one
54 # has to be content with 40-85% improvement depending on benchmark and
55 # key length, more for longer keys.
57 $flavour = shift || "o32";
58 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
59 open STDOUT,">$output";
61 if ($flavour =~ /64|n32/i) {
94 # Below is N32/64 register layout used in the original module.
96 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
97 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
98 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
99 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
100 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
101 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
103 # No special adaptation is required for O32. NUBI on the other hand
104 # is treated by saving/restoring ($v1,$t0..$t3).
106 $gp=$v1 if ($flavour =~ /nubi/i);
111 #include "mips_arch.h"
113 #if defined(_MIPS_ARCH_MIPS64R6)
114 # define ddivu(rs,rt)
115 # define mfqt(rd,rs,rt) ddivu rd,rs,rt
116 # define mfrm(rd,rs,rt) dmodu rd,rs,rt
117 #elif defined(_MIPS_ARCH_MIPS32R6)
119 # define mfqt(rd,rs,rt) divu rd,rs,rt
120 # define mfrm(rd,rs,rt) modu rd,rs,rt
122 # define $DIVU(rs,rt) $DIVU $zero,rs,rt
123 # define mfqt(rd,rs,rt) mflo rd
124 # define mfrm(rd,rs,rt) mfhi rd
128 .asciiz "mips3.s, Version 1.2"
129 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
135 .globl bn_mul_add_words
136 .ent bn_mul_add_words
139 bgtz $a2,bn_mul_add_words_internal
143 .end bn_mul_add_words
146 .ent bn_mul_add_words_internal
147 bn_mul_add_words_internal:
149 $code.=<<___ if ($flavour =~ /nubi/i);
150 .frame $sp,6*$SZREG,$ra
151 .mask 0x8000f008,-$SZREG
153 $PTR_SUB $sp,6*$SZREG
154 $REG_S $ra,5*$SZREG($sp)
155 $REG_S $t3,4*$SZREG($sp)
156 $REG_S $t2,3*$SZREG($sp)
157 $REG_S $t1,2*$SZREG($sp)
158 $REG_S $t0,1*$SZREG($sp)
159 $REG_S $gp,0*$SZREG($sp)
165 beqz $ta0,.L_bn_mul_add_words_tail
167 .L_bn_mul_add_words_loop:
173 $LD $ta0,2*$BNSZ($a1)
174 $LD $ta1,2*$BNSZ($a0)
176 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
177 # values", but it seems to work fine
178 # even on 64-bit registers.
188 $LD $ta2,3*$BNSZ($a1)
189 $LD $ta3,3*$BNSZ($a0)
212 $ST $ta1,-2*$BNSZ($a0)
226 bgtz $ta0,.L_bn_mul_add_words_loop
229 beqz $a2,.L_bn_mul_add_words_return
232 .L_bn_mul_add_words_tail:
247 beqz $a2,.L_bn_mul_add_words_return
262 beqz $a2,.L_bn_mul_add_words_return
277 .L_bn_mul_add_words_return:
280 $code.=<<___ if ($flavour =~ /nubi/i);
281 $REG_L $t3,4*$SZREG($sp)
282 $REG_L $t2,3*$SZREG($sp)
283 $REG_L $t1,2*$SZREG($sp)
284 $REG_L $t0,1*$SZREG($sp)
285 $REG_L $gp,0*$SZREG($sp)
286 $PTR_ADD $sp,6*$SZREG
291 .end bn_mul_add_words_internal
298 bgtz $a2,bn_mul_words_internal
305 .ent bn_mul_words_internal
306 bn_mul_words_internal:
308 $code.=<<___ if ($flavour =~ /nubi/i);
309 .frame $sp,6*$SZREG,$ra
310 .mask 0x8000f008,-$SZREG
312 $PTR_SUB $sp,6*$SZREG
313 $REG_S $ra,5*$SZREG($sp)
314 $REG_S $t3,4*$SZREG($sp)
315 $REG_S $t2,3*$SZREG($sp)
316 $REG_S $t1,2*$SZREG($sp)
317 $REG_S $t0,1*$SZREG($sp)
318 $REG_S $gp,0*$SZREG($sp)
324 beqz $ta0,.L_bn_mul_words_tail
326 .L_bn_mul_words_loop:
330 $LD $ta0,2*$BNSZ($a1)
331 $LD $ta2,3*$BNSZ($a1)
348 $ST $v0,-3*$BNSZ($a0)
356 $ST $v0,-2*$BNSZ($a0)
366 bgtz $ta0,.L_bn_mul_words_loop
369 beqz $a2,.L_bn_mul_words_return
372 .L_bn_mul_words_tail:
383 beqz $a2,.L_bn_mul_words_return
394 beqz $a2,.L_bn_mul_words_return
405 .L_bn_mul_words_return:
408 $code.=<<___ if ($flavour =~ /nubi/i);
409 $REG_L $t3,4*$SZREG($sp)
410 $REG_L $t2,3*$SZREG($sp)
411 $REG_L $t1,2*$SZREG($sp)
412 $REG_L $t0,1*$SZREG($sp)
413 $REG_L $gp,0*$SZREG($sp)
414 $PTR_ADD $sp,6*$SZREG
419 .end bn_mul_words_internal
426 bgtz $a2,bn_sqr_words_internal
433 .ent bn_sqr_words_internal
434 bn_sqr_words_internal:
436 $code.=<<___ if ($flavour =~ /nubi/i);
437 .frame $sp,6*$SZREG,$ra
438 .mask 0x8000f008,-$SZREG
440 $PTR_SUB $sp,6*$SZREG
441 $REG_S $ra,5*$SZREG($sp)
442 $REG_S $t3,4*$SZREG($sp)
443 $REG_S $t2,3*$SZREG($sp)
444 $REG_S $t1,2*$SZREG($sp)
445 $REG_S $t0,1*$SZREG($sp)
446 $REG_S $gp,0*$SZREG($sp)
452 beqz $ta0,.L_bn_sqr_words_tail
454 .L_bn_sqr_words_loop:
458 $LD $ta0,2*$BNSZ($a1)
459 $LD $ta2,3*$BNSZ($a1)
471 $ST $t3,-6*$BNSZ($a0)
472 $ST $t2,-5*$BNSZ($a0)
475 mflo ($ta1,$ta0,$ta0)
476 mfhi ($ta0,$ta0,$ta0)
477 $ST $ta1,-4*$BNSZ($a0)
478 $ST $ta0,-3*$BNSZ($a0)
483 mflo ($ta3,$ta2,$ta2)
484 mfhi ($ta2,$ta2,$ta2)
485 $ST $ta3,-2*$BNSZ($a0)
488 bgtz $ta0,.L_bn_sqr_words_loop
491 beqz $a2,.L_bn_sqr_words_return
494 .L_bn_sqr_words_tail:
503 beqz $a2,.L_bn_sqr_words_return
512 beqz $a2,.L_bn_sqr_words_return
521 .L_bn_sqr_words_return:
524 $code.=<<___ if ($flavour =~ /nubi/i);
525 $REG_L $t3,4*$SZREG($sp)
526 $REG_L $t2,3*$SZREG($sp)
527 $REG_L $t1,2*$SZREG($sp)
528 $REG_L $t0,1*$SZREG($sp)
529 $REG_L $gp,0*$SZREG($sp)
530 $PTR_ADD $sp,6*$SZREG
536 .end bn_sqr_words_internal
543 bgtz $a3,bn_add_words_internal
550 .ent bn_add_words_internal
551 bn_add_words_internal:
553 $code.=<<___ if ($flavour =~ /nubi/i);
554 .frame $sp,6*$SZREG,$ra
555 .mask 0x8000f008,-$SZREG
557 $PTR_SUB $sp,6*$SZREG
558 $REG_S $ra,5*$SZREG($sp)
559 $REG_S $t3,4*$SZREG($sp)
560 $REG_S $t2,3*$SZREG($sp)
561 $REG_S $t1,2*$SZREG($sp)
562 $REG_S $t0,1*$SZREG($sp)
563 $REG_S $gp,0*$SZREG($sp)
569 beqz $at,.L_bn_add_words_tail
571 .L_bn_add_words_loop:
581 $LD $ta1,-3*$BNSZ($a2)
583 $LD $ta2,-2*$BNSZ($a2)
589 $ST $t0,-4*$BNSZ($a0)
596 $ST $t1,-3*$BNSZ($a0)
603 $ST $t2,-2*$BNSZ($a0)
613 bgtz $at,.L_bn_add_words_loop
616 beqz $a3,.L_bn_add_words_return
619 .L_bn_add_words_tail:
630 beqz $a3,.L_bn_add_words_return
641 beqz $a3,.L_bn_add_words_return
644 $LD $ta2,2*$BNSZ($a2)
652 .L_bn_add_words_return:
655 $code.=<<___ if ($flavour =~ /nubi/i);
656 $REG_L $t3,4*$SZREG($sp)
657 $REG_L $t2,3*$SZREG($sp)
658 $REG_L $t1,2*$SZREG($sp)
659 $REG_L $t0,1*$SZREG($sp)
660 $REG_L $gp,0*$SZREG($sp)
661 $PTR_ADD $sp,6*$SZREG
667 .end bn_add_words_internal
674 bgtz $a3,bn_sub_words_internal
681 .ent bn_sub_words_internal
682 bn_sub_words_internal:
684 $code.=<<___ if ($flavour =~ /nubi/i);
685 .frame $sp,6*$SZREG,$ra
686 .mask 0x8000f008,-$SZREG
688 $PTR_SUB $sp,6*$SZREG
689 $REG_S $ra,5*$SZREG($sp)
690 $REG_S $t3,4*$SZREG($sp)
691 $REG_S $t2,3*$SZREG($sp)
692 $REG_S $t1,2*$SZREG($sp)
693 $REG_S $t0,1*$SZREG($sp)
694 $REG_S $gp,0*$SZREG($sp)
700 beqz $at,.L_bn_sub_words_tail
702 .L_bn_sub_words_loop:
712 $LD $ta1,-3*$BNSZ($a2)
714 $LD $ta2,-2*$BNSZ($a2)
720 $ST $t0,-4*$BNSZ($a0)
727 $ST $t1,-3*$BNSZ($a0)
735 $ST $t2,-2*$BNSZ($a0)
745 bgtz $at,.L_bn_sub_words_loop
748 beqz $a3,.L_bn_sub_words_return
751 .L_bn_sub_words_tail:
762 beqz $a3,.L_bn_sub_words_return
773 beqz $a3,.L_bn_sub_words_return
776 $LD $ta2,2*$BNSZ($a2)
784 .L_bn_sub_words_return:
787 $code.=<<___ if ($flavour =~ /nubi/i);
788 $REG_L $t3,4*$SZREG($sp)
789 $REG_L $t2,3*$SZREG($sp)
790 $REG_L $t1,2*$SZREG($sp)
791 $REG_L $t0,1*$SZREG($sp)
792 $REG_L $gp,0*$SZREG($sp)
793 $PTR_ADD $sp,6*$SZREG
798 .end bn_sub_words_internal
802 * The bn_div_3_words entry point is re-used for constant-time interface.
803 * Implementation is retained as historical reference.
806 .globl bn_div_3_words
810 move $a3,$a0 # we know that bn_div_words does not
811 # touch $a3, $ta2, $ta3 and preserves $a2
812 # so that we can save two arguments
813 # and return address in registers
814 # instead of stack:-)
818 bne $a0,$a2,bn_div_3_words_internal
826 .ent bn_div_3_words_internal
827 bn_div_3_words_internal:
829 $code.=<<___ if ($flavour =~ /nubi/i);
830 .frame $sp,6*$SZREG,$ra
831 .mask 0x8000f008,-$SZREG
833 $PTR_SUB $sp,6*$SZREG
834 $REG_S $ra,5*$SZREG($sp)
835 $REG_S $t3,4*$SZREG($sp)
836 $REG_S $t2,3*$SZREG($sp)
837 $REG_S $t1,2*$SZREG($sp)
838 $REG_S $t0,1*$SZREG($sp)
839 $REG_S $gp,0*$SZREG($sp)
844 bal bn_div_words_internal
847 $LD $t2,-2*$BNSZ($a3)
852 .L_bn_div_3_words_inner_loop:
853 bnez $t8,.L_bn_div_3_words_inner_loop_done
865 beqz $at,.L_bn_div_3_words_inner_loop
869 .L_bn_div_3_words_inner_loop_done:
872 $code.=<<___ if ($flavour =~ /nubi/i);
873 $REG_L $t3,4*$SZREG($sp)
874 $REG_L $t2,3*$SZREG($sp)
875 $REG_L $t1,2*$SZREG($sp)
876 $REG_L $t0,1*$SZREG($sp)
877 $REG_L $gp,0*$SZREG($sp)
878 $PTR_ADD $sp,6*$SZREG
883 .end bn_div_3_words_internal
891 bnez $a2,bn_div_words_internal
892 li $v0,-1 # I would rather signal div-by-zero
893 # which can be done with 'break 7'
899 .ent bn_div_words_internal
900 bn_div_words_internal:
902 $code.=<<___ if ($flavour =~ /nubi/i);
903 .frame $sp,6*$SZREG,$ra
904 .mask 0x8000f008,-$SZREG
906 $PTR_SUB $sp,6*$SZREG
907 $REG_S $ra,5*$SZREG($sp)
908 $REG_S $t3,4*$SZREG($sp)
909 $REG_S $t2,3*$SZREG($sp)
910 $REG_S $t1,2*$SZREG($sp)
911 $REG_S $t0,1*$SZREG($sp)
912 $REG_S $gp,0*$SZREG($sp)
916 bltz $a2,.L_bn_div_words_body
931 break 6 # signal overflow
941 .L_bn_div_words_body:
942 $SRL $DH,$a2,4*$BNSZ # bits
951 $SRL $HH,$a0,4*$BNSZ # bits
952 $SRL $QT,4*$BNSZ # q=0xffffffff
953 beq $DH,$HH,.L_bn_div_words_skip_div1
956 .L_bn_div_words_skip_div1:
958 $SLL $t3,$a0,4*$BNSZ # bits
959 $SRL $at,$a1,4*$BNSZ # bits
963 .L_bn_div_words_inner_loop1:
971 beqz $at,.L_bn_div_words_inner_loop1_done
974 b .L_bn_div_words_inner_loop1
977 .L_bn_div_words_inner_loop1_done:
979 $SLL $a1,4*$BNSZ # bits
981 $SLL $v0,$QT,4*$BNSZ # bits
984 $SRL $HH,$a0,4*$BNSZ # bits
985 $SRL $QT,4*$BNSZ # q=0xffffffff
986 beq $DH,$HH,.L_bn_div_words_skip_div2
989 .L_bn_div_words_skip_div2:
991 $SLL $t3,$a0,4*$BNSZ # bits
992 $SRL $at,$a1,4*$BNSZ # bits
996 .L_bn_div_words_inner_loop2:
1004 beqz $at,.L_bn_div_words_inner_loop2_done
1007 b .L_bn_div_words_inner_loop2
1010 .L_bn_div_words_inner_loop2_done:
1014 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1015 $SRL $a2,$t9 # restore $a2
1020 $code.=<<___ if ($flavour =~ /nubi/i);
1021 $REG_L $t3,4*$SZREG($sp)
1022 $REG_L $t2,3*$SZREG($sp)
1023 $REG_L $t1,2*$SZREG($sp)
1024 $REG_L $t0,1*$SZREG($sp)
1025 $REG_L $gp,0*$SZREG($sp)
1026 $PTR_ADD $sp,6*$SZREG
1031 .end bn_div_words_internal
1033 undef $HH; undef $QT; undef $DH;
1035 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1036 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1038 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1039 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1041 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1046 .globl bn_mul_comba8
1051 $code.=<<___ if ($flavour =~ /nubi/i);
1052 .frame $sp,12*$SZREG,$ra
1053 .mask 0x803ff008,-$SZREG
1054 $PTR_SUB $sp,12*$SZREG
1055 $REG_S $ra,11*$SZREG($sp)
1056 $REG_S $s5,10*$SZREG($sp)
1057 $REG_S $s4,9*$SZREG($sp)
1058 $REG_S $s3,8*$SZREG($sp)
1059 $REG_S $s2,7*$SZREG($sp)
1060 $REG_S $s1,6*$SZREG($sp)
1061 $REG_S $s0,5*$SZREG($sp)
1062 $REG_S $t3,4*$SZREG($sp)
1063 $REG_S $t2,3*$SZREG($sp)
1064 $REG_S $t1,2*$SZREG($sp)
1065 $REG_S $t0,1*$SZREG($sp)
1066 $REG_S $gp,0*$SZREG($sp)
1068 $code.=<<___ if ($flavour !~ /nubi/i);
1069 .frame $sp,6*$SZREG,$ra
1070 .mask 0x003f0000,-$SZREG
1071 $PTR_SUB $sp,6*$SZREG
1072 $REG_S $s5,5*$SZREG($sp)
1073 $REG_S $s4,4*$SZREG($sp)
1074 $REG_S $s3,3*$SZREG($sp)
1075 $REG_S $s2,2*$SZREG($sp)
1076 $REG_S $s1,1*$SZREG($sp)
1077 $REG_S $s0,0*$SZREG($sp)
1082 $LD $a_0,0($a1) # If compiled with -mips3 option on
1083 # R5000 box assembler barks on this
1084 # line with "should not have mult/div
1085 # as last instruction in bb (R10K
1086 # bug)" warning. If anybody out there
1087 # has a clue about how to circumvent
1088 # this do send me a note.
1089 # <appro\@fy.chalmers.se>
1093 $LD $a_2,2*$BNSZ($a1)
1094 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1095 $LD $a_3,3*$BNSZ($a1)
1097 $LD $b_2,2*$BNSZ($a2)
1098 $LD $b_3,3*$BNSZ($a2)
1099 mflo ($c_1,$a_0,$b_0)
1100 mfhi ($c_2,$a_0,$b_0)
1102 $LD $a_4,4*$BNSZ($a1)
1103 $LD $a_5,5*$BNSZ($a1)
1104 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1105 $LD $a_6,6*$BNSZ($a1)
1106 $LD $a_7,7*$BNSZ($a1)
1107 $LD $b_4,4*$BNSZ($a2)
1108 $LD $b_5,5*$BNSZ($a2)
1109 mflo ($t_1,$a_0,$b_1)
1110 mfhi ($t_2,$a_0,$b_1)
1113 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1115 $LD $b_6,6*$BNSZ($a2)
1116 $LD $b_7,7*$BNSZ($a2)
1117 $ST $c_1,0($a0) # r[0]=c1;
1118 mflo ($t_1,$a_1,$b_0)
1119 mfhi ($t_2,$a_1,$b_0)
1122 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1126 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1128 mflo ($t_1,$a_2,$b_0)
1129 mfhi ($t_2,$a_2,$b_0)
1132 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1135 mflo ($t_1,$a_1,$b_1)
1136 mfhi ($t_2,$a_1,$b_1)
1139 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1143 mflo ($t_1,$a_0,$b_2)
1144 mfhi ($t_2,$a_0,$b_2)
1147 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1152 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1154 mflo ($t_1,$a_0,$b_3)
1155 mfhi ($t_2,$a_0,$b_3)
1158 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1162 mflo ($t_1,$a_1,$b_2)
1163 mfhi ($t_2,$a_1,$b_2)
1166 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1171 mflo ($t_1,$a_2,$b_1)
1172 mfhi ($t_2,$a_2,$b_1)
1175 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1180 mflo ($t_1,$a_3,$b_0)
1181 mfhi ($t_2,$a_3,$b_0)
1184 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1189 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1191 mflo ($t_1,$a_4,$b_0)
1192 mfhi ($t_2,$a_4,$b_0)
1195 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1199 mflo ($t_1,$a_3,$b_1)
1200 mfhi ($t_2,$a_3,$b_1)
1203 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1208 mflo ($t_1,$a_2,$b_2)
1209 mfhi ($t_2,$a_2,$b_2)
1212 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1217 mflo ($t_1,$a_1,$b_3)
1218 mfhi ($t_2,$a_1,$b_3)
1221 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1226 mflo ($t_1,$a_0,$b_4)
1227 mfhi ($t_2,$a_0,$b_4)
1230 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1235 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1237 mflo ($t_1,$a_0,$b_5)
1238 mfhi ($t_2,$a_0,$b_5)
1241 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1245 mflo ($t_1,$a_1,$b_4)
1246 mfhi ($t_2,$a_1,$b_4)
1249 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1254 mflo ($t_1,$a_2,$b_3)
1255 mfhi ($t_2,$a_2,$b_3)
1258 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1263 mflo ($t_1,$a_3,$b_2)
1264 mfhi ($t_2,$a_3,$b_2)
1267 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1272 mflo ($t_1,$a_4,$b_1)
1273 mfhi ($t_2,$a_4,$b_1)
1276 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1281 mflo ($t_1,$a_5,$b_0)
1282 mfhi ($t_2,$a_5,$b_0)
1285 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1290 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1292 mflo ($t_1,$a_6,$b_0)
1293 mfhi ($t_2,$a_6,$b_0)
1296 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1300 mflo ($t_1,$a_5,$b_1)
1301 mfhi ($t_2,$a_5,$b_1)
1304 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1309 mflo ($t_1,$a_4,$b_2)
1310 mfhi ($t_2,$a_4,$b_2)
1313 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1318 mflo ($t_1,$a_3,$b_3)
1319 mfhi ($t_2,$a_3,$b_3)
1322 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1327 mflo ($t_1,$a_2,$b_4)
1328 mfhi ($t_2,$a_2,$b_4)
1331 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1336 mflo ($t_1,$a_1,$b_5)
1337 mfhi ($t_2,$a_1,$b_5)
1340 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1345 mflo ($t_1,$a_0,$b_6)
1346 mfhi ($t_2,$a_0,$b_6)
1349 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1354 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1356 mflo ($t_1,$a_0,$b_7)
1357 mfhi ($t_2,$a_0,$b_7)
1360 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1364 mflo ($t_1,$a_1,$b_6)
1365 mfhi ($t_2,$a_1,$b_6)
1368 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1373 mflo ($t_1,$a_2,$b_5)
1374 mfhi ($t_2,$a_2,$b_5)
1377 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1382 mflo ($t_1,$a_3,$b_4)
1383 mfhi ($t_2,$a_3,$b_4)
1386 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1391 mflo ($t_1,$a_4,$b_3)
1392 mfhi ($t_2,$a_4,$b_3)
1395 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1400 mflo ($t_1,$a_5,$b_2)
1401 mfhi ($t_2,$a_5,$b_2)
1404 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1409 mflo ($t_1,$a_6,$b_1)
1410 mfhi ($t_2,$a_6,$b_1)
1413 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1418 mflo ($t_1,$a_7,$b_0)
1419 mfhi ($t_2,$a_7,$b_0)
1422 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1427 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1429 mflo ($t_1,$a_7,$b_1)
1430 mfhi ($t_2,$a_7,$b_1)
1433 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1437 mflo ($t_1,$a_6,$b_2)
1438 mfhi ($t_2,$a_6,$b_2)
1441 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1446 mflo ($t_1,$a_5,$b_3)
1447 mfhi ($t_2,$a_5,$b_3)
1450 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1455 mflo ($t_1,$a_4,$b_4)
1456 mfhi ($t_2,$a_4,$b_4)
1459 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1464 mflo ($t_1,$a_3,$b_5)
1465 mfhi ($t_2,$a_3,$b_5)
1468 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1473 mflo ($t_1,$a_2,$b_6)
1474 mfhi ($t_2,$a_2,$b_6)
1477 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1482 mflo ($t_1,$a_1,$b_7)
1483 mfhi ($t_2,$a_1,$b_7)
1486 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1491 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1493 mflo ($t_1,$a_2,$b_7)
1494 mfhi ($t_2,$a_2,$b_7)
1497 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1501 mflo ($t_1,$a_3,$b_6)
1502 mfhi ($t_2,$a_3,$b_6)
1505 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1510 mflo ($t_1,$a_4,$b_5)
1511 mfhi ($t_2,$a_4,$b_5)
1514 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1519 mflo ($t_1,$a_5,$b_4)
1520 mfhi ($t_2,$a_5,$b_4)
1523 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1528 mflo ($t_1,$a_6,$b_3)
1529 mfhi ($t_2,$a_6,$b_3)
1532 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1537 mflo ($t_1,$a_7,$b_2)
1538 mfhi ($t_2,$a_7,$b_2)
1541 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1546 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1548 mflo ($t_1,$a_7,$b_3)
1549 mfhi ($t_2,$a_7,$b_3)
1552 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1556 mflo ($t_1,$a_6,$b_4)
1557 mfhi ($t_2,$a_6,$b_4)
1560 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1565 mflo ($t_1,$a_5,$b_5)
1566 mfhi ($t_2,$a_5,$b_5)
1569 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1574 mflo ($t_1,$a_4,$b_6)
1575 mfhi ($t_2,$a_4,$b_6)
1578 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1583 mflo ($t_1,$a_3,$b_7)
1584 mfhi ($t_2,$a_3,$b_7)
1587 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1592 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1594 mflo ($t_1,$a_4,$b_7)
1595 mfhi ($t_2,$a_4,$b_7)
1598 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1602 mflo ($t_1,$a_5,$b_6)
1603 mfhi ($t_2,$a_5,$b_6)
1606 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1611 mflo ($t_1,$a_6,$b_5)
1612 mfhi ($t_2,$a_6,$b_5)
1615 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1620 mflo ($t_1,$a_7,$b_4)
1621 mfhi ($t_2,$a_7,$b_4)
1624 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1629 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1631 mflo ($t_1,$a_7,$b_5)
1632 mfhi ($t_2,$a_7,$b_5)
1635 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1639 mflo ($t_1,$a_6,$b_6)
1640 mfhi ($t_2,$a_6,$b_6)
1643 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1648 mflo ($t_1,$a_5,$b_7)
1649 mfhi ($t_2,$a_5,$b_7)
1652 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1657 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1659 mflo ($t_1,$a_6,$b_7)
1660 mfhi ($t_2,$a_6,$b_7)
1663 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1667 mflo ($t_1,$a_7,$b_6)
1668 mfhi ($t_2,$a_7,$b_6)
1671 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1676 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1678 mflo ($t_1,$a_7,$b_7)
1679 mfhi ($t_2,$a_7,$b_7)
1684 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1685 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1689 $code.=<<___ if ($flavour =~ /nubi/i);
1690 $REG_L $s5,10*$SZREG($sp)
1691 $REG_L $s4,9*$SZREG($sp)
1692 $REG_L $s3,8*$SZREG($sp)
1693 $REG_L $s2,7*$SZREG($sp)
1694 $REG_L $s1,6*$SZREG($sp)
1695 $REG_L $s0,5*$SZREG($sp)
1696 $REG_L $t3,4*$SZREG($sp)
1697 $REG_L $t2,3*$SZREG($sp)
1698 $REG_L $t1,2*$SZREG($sp)
1699 $REG_L $t0,1*$SZREG($sp)
1700 $REG_L $gp,0*$SZREG($sp)
1702 $PTR_ADD $sp,12*$SZREG
1704 $code.=<<___ if ($flavour !~ /nubi/i);
1705 $REG_L $s5,5*$SZREG($sp)
1706 $REG_L $s4,4*$SZREG($sp)
1707 $REG_L $s3,3*$SZREG($sp)
1708 $REG_L $s2,2*$SZREG($sp)
1709 $REG_L $s1,1*$SZREG($sp)
1710 $REG_L $s0,0*$SZREG($sp)
1712 $PTR_ADD $sp,6*$SZREG
1718 .globl bn_mul_comba4
1722 $code.=<<___ if ($flavour =~ /nubi/i);
1723 .frame $sp,6*$SZREG,$ra
1724 .mask 0x8000f008,-$SZREG
1726 $PTR_SUB $sp,6*$SZREG
1727 $REG_S $ra,5*$SZREG($sp)
1728 $REG_S $t3,4*$SZREG($sp)
1729 $REG_S $t2,3*$SZREG($sp)
1730 $REG_S $t1,2*$SZREG($sp)
1731 $REG_S $t0,1*$SZREG($sp)
1732 $REG_S $gp,0*$SZREG($sp)
1739 $LD $a_2,2*$BNSZ($a1)
1740 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1741 $LD $a_3,3*$BNSZ($a1)
1743 $LD $b_2,2*$BNSZ($a2)
1744 $LD $b_3,3*$BNSZ($a2)
1745 mflo ($c_1,$a_0,$b_0)
1746 mfhi ($c_2,$a_0,$b_0)
1749 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1750 mflo ($t_1,$a_0,$b_1)
1751 mfhi ($t_2,$a_0,$b_1)
1754 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1756 mflo ($t_1,$a_1,$b_0)
1757 mfhi ($t_2,$a_1,$b_0)
1760 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1766 mflo ($t_1,$a_2,$b_0)
1767 mfhi ($t_2,$a_2,$b_0)
1770 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1773 mflo ($t_1,$a_1,$b_1)
1774 mfhi ($t_2,$a_1,$b_1)
1777 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1781 mflo ($t_1,$a_0,$b_2)
1782 mfhi ($t_2,$a_0,$b_2)
1785 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1790 $ST $c_3,2*$BNSZ($a0)
1792 mflo ($t_1,$a_0,$b_3)
1793 mfhi ($t_2,$a_0,$b_3)
1796 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1800 mflo ($t_1,$a_1,$b_2)
1801 mfhi ($t_2,$a_1,$b_2)
1804 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1809 mflo ($t_1,$a_2,$b_1)
1810 mfhi ($t_2,$a_2,$b_1)
1813 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1818 mflo ($t_1,$a_3,$b_0)
1819 mfhi ($t_2,$a_3,$b_0)
1822 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1827 $ST $c_1,3*$BNSZ($a0)
1829 mflo ($t_1,$a_3,$b_1)
1830 mfhi ($t_2,$a_3,$b_1)
1833 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1837 mflo ($t_1,$a_2,$b_2)
1838 mfhi ($t_2,$a_2,$b_2)
1841 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1846 mflo ($t_1,$a_1,$b_3)
1847 mfhi ($t_2,$a_1,$b_3)
1850 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1855 $ST $c_2,4*$BNSZ($a0)
1857 mflo ($t_1,$a_2,$b_3)
1858 mfhi ($t_2,$a_2,$b_3)
1861 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1865 mflo ($t_1,$a_3,$b_2)
1866 mfhi ($t_2,$a_3,$b_2)
1869 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1874 $ST $c_3,5*$BNSZ($a0)
1876 mflo ($t_1,$a_3,$b_3)
1877 mfhi ($t_2,$a_3,$b_3)
1882 $ST $c_1,6*$BNSZ($a0)
1883 $ST $c_2,7*$BNSZ($a0)
1887 $code.=<<___ if ($flavour =~ /nubi/i);
1888 $REG_L $t3,4*$SZREG($sp)
1889 $REG_L $t2,3*$SZREG($sp)
1890 $REG_L $t1,2*$SZREG($sp)
1891 $REG_L $t0,1*$SZREG($sp)
1892 $REG_L $gp,0*$SZREG($sp)
1893 $PTR_ADD $sp,6*$SZREG
1901 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1904 my ($hi,$lo,$c0,$c1,$c2,
1905 $warm, # !$warm denotes first call with specific sequence of
1906 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1907 $an,$bn # these two are arguments for multiplication which
1908 # result is used in *next* step [which is why it's
1909 # commented as "forward multiplication" below];
1914 $MULTU ($an,$bn) # forward multiplication
1921 $code.=<<___ if (!$warm);
1925 $code.=<<___ if ($warm);
1941 .globl bn_sqr_comba8
1945 $code.=<<___ if ($flavour =~ /nubi/i);
1946 .frame $sp,6*$SZREG,$ra
1947 .mask 0x8000f008,-$SZREG
1949 $PTR_SUB $sp,6*$SZREG
1950 $REG_S $ra,5*$SZREG($sp)
1951 $REG_S $t3,4*$SZREG($sp)
1952 $REG_S $t2,3*$SZREG($sp)
1953 $REG_S $t1,2*$SZREG($sp)
1954 $REG_S $t0,1*$SZREG($sp)
1955 $REG_S $gp,0*$SZREG($sp)
1961 $LD $a_2,2*$BNSZ($a1)
1962 $LD $a_3,3*$BNSZ($a1)
1964 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1965 $LD $a_4,4*$BNSZ($a1)
1966 $LD $a_5,5*$BNSZ($a1)
1967 $LD $a_6,6*$BNSZ($a1)
1968 $LD $a_7,7*$BNSZ($a1)
1969 mflo ($c_1,$a_0,$a_0)
1970 mfhi ($c_2,$a_0,$a_0)
1973 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1974 mflo ($t_1,$a_0,$a_1)
1975 mfhi ($t_2,$a_0,$a_1)
1978 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
1986 mflo ($t_1,$a_2,$a_0)
1987 mfhi ($t_2,$a_2,$a_0)
1989 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1990 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1994 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
1999 $ST $c_3,2*$BNSZ($a0)
2000 mflo ($t_1,$a_0,$a_3)
2001 mfhi ($t_2,$a_0,$a_3)
2003 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2004 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2005 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2006 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2008 $ST $c_1,3*$BNSZ($a0)
2010 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2011 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2012 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2013 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2017 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
2022 $ST $c_2,4*$BNSZ($a0)
2023 mflo ($t_1,$a_0,$a_5)
2024 mfhi ($t_2,$a_0,$a_5)
2026 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2027 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2028 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2029 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2030 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2031 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2033 $ST $c_3,5*$BNSZ($a0)
2035 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2036 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2037 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2038 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2039 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2040 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2044 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
2049 $ST $c_1,6*$BNSZ($a0)
2050 mflo ($t_1,$a_0,$a_7)
2051 mfhi ($t_2,$a_0,$a_7)
2053 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2054 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2055 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2056 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2057 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2058 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2059 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2060 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2062 $ST $c_2,7*$BNSZ($a0)
2064 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2065 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2066 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2067 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2068 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2069 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2073 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
2078 $ST $c_3,8*$BNSZ($a0)
2079 mflo ($t_1,$a_2,$a_7)
2080 mfhi ($t_2,$a_2,$a_7)
2082 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2083 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2084 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2085 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2086 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2087 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2089 $ST $c_1,9*$BNSZ($a0)
2091 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2092 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2093 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2094 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2098 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
2103 $ST $c_2,10*$BNSZ($a0)
2104 mflo ($t_1,$a_4,$a_7)
2105 mfhi ($t_2,$a_4,$a_7)
2107 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2108 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2109 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2110 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2112 $ST $c_3,11*$BNSZ($a0)
2114 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2115 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2119 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
2124 $ST $c_1,12*$BNSZ($a0)
2125 mflo ($t_1,$a_6,$a_7)
2126 mfhi ($t_2,$a_6,$a_7)
2128 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2129 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2131 $ST $c_2,13*$BNSZ($a0)
2137 $ST $c_3,14*$BNSZ($a0)
2138 $ST $c_1,15*$BNSZ($a0)
2142 $code.=<<___ if ($flavour =~ /nubi/i);
2143 $REG_L $t3,4*$SZREG($sp)
2144 $REG_L $t2,3*$SZREG($sp)
2145 $REG_L $t1,2*$SZREG($sp)
2146 $REG_L $t0,1*$SZREG($sp)
2147 $REG_L $gp,0*$SZREG($sp)
2148 $PTR_ADD $sp,6*$SZREG
2156 .globl bn_sqr_comba4
2160 $code.=<<___ if ($flavour =~ /nubi/i);
2161 .frame $sp,6*$SZREG,$ra
2162 .mask 0x8000f008,-$SZREG
2164 $PTR_SUB $sp,6*$SZREG
2165 $REG_S $ra,5*$SZREG($sp)
2166 $REG_S $t3,4*$SZREG($sp)
2167 $REG_S $t2,3*$SZREG($sp)
2168 $REG_S $t1,2*$SZREG($sp)
2169 $REG_S $t0,1*$SZREG($sp)
2170 $REG_S $gp,0*$SZREG($sp)
2176 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
2177 $LD $a_2,2*$BNSZ($a1)
2178 $LD $a_3,3*$BNSZ($a1)
2179 mflo ($c_1,$a_0,$a_0)
2180 mfhi ($c_2,$a_0,$a_0)
2183 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2184 mflo ($t_1,$a_0,$a_1)
2185 mfhi ($t_2,$a_0,$a_1)
2188 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
2196 mflo ($t_1,$a_2,$a_0)
2197 mfhi ($t_2,$a_2,$a_0)
2199 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2200 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2204 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2209 $ST $c_3,2*$BNSZ($a0)
2210 mflo ($t_1,$a_0,$a_3)
2211 mfhi ($t_2,$a_0,$a_3)
2213 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2214 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2215 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2216 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2218 $ST $c_1,3*$BNSZ($a0)
2220 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2221 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2225 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
2230 $ST $c_2,4*$BNSZ($a0)
2231 mflo ($t_1,$a_2,$a_3)
2232 mfhi ($t_2,$a_2,$a_3)
2234 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2235 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2237 $ST $c_3,5*$BNSZ($a0)
2243 $ST $c_1,6*$BNSZ($a0)
2244 $ST $c_2,7*$BNSZ($a0)
2248 $code.=<<___ if ($flavour =~ /nubi/i);
2249 $REG_L $t3,4*$SZREG($sp)
2250 $REG_L $t2,3*$SZREG($sp)
2251 $REG_L $t1,2*$SZREG($sp)
2252 $REG_L $t0,1*$SZREG($sp)
2253 $REG_L $gp,0*$SZREG($sp)
2254 $PTR_ADD $sp,6*$SZREG