2 # Copyright 2010-2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # Rights for redistribution and usage in source and binary forms are
15 # granted according to the OpenSSL license. Warranty of any kind is
17 # ====================================================================
22 # This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
24 # The module is designed to work with either of the "new" MIPS ABI(5),
25 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
26 # IRIX 5.x, not only because it doesn't support the new ABIs but also
27 # because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
28 # 64-bit instructions (daddu, dmultu, etc.) found below would only
29 # cause an illegal instruction exception:-(
31 # In addition the code depends on preprocessor flags set up by MIPSpro
32 # compiler driver (either as or cc) and therefore (probably?) can't be
33 # compiled by the GNU assembler. GNU C driver manages fine though...
34 # I mean as long as -mmips-as is specified or is the default option,
35 # because then it simply invokes /usr/bin/as which in turn takes
36 # perfect care of the preprocessor definitions. Another neat feature
37 # offered by the MIPSpro assembler is an optimization pass. This gave
38 # me the opportunity to have the code looking more regular as all those
39 # architecture dependent instruction rescheduling details were left to
40 # the assembler. Cool, huh?
42 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
43 # goes way over 3 times faster!
49 # Adapt the module even for 32-bit ABIs and other OSes. The former was
50 # achieved by mechanical replacement of 64-bit arithmetic instructions
51 # such as dmultu, daddu, etc. with their 32-bit counterparts and
52 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
53 # >3x performance improvement naturally does not apply to 32-bit code
54 # [because there is no instruction a 32-bit compiler can't use]; one
55 # has to be content with a 40-85% improvement depending on benchmark and
56 # key length, more for longer keys.
# Command line: the first argument selects the ABI flavour and defaults
# to "o32" when it is absent (or false).
$flavour = shift || "o32";

# Discard further arguments until one ends in something shaped like
# "name.ext"; $output is left false if the argument list runs out first.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named.  Opening
# ">" with an empty name fails, and a failed open on an already-open
# handle leaves that handle closed, so everything printed afterwards
# would be lost without any diagnostic.  A failed open is now fatal.
if ($output) {
	open(STDOUT, ">", $output) or die "can't open $output: $!";
}
62 if ($flavour =~ /64|n32/i) {
92 $code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
#
# Register names, following the N32/64 layout used in the original
# module.  Each variable holds the "$<number>" spelling of one register.
#
($zero, $at, $v0, $v1)                   = map('$'.$_, 0..3);
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map('$'.$_, 4..11);
($t0, $t1, $t2, $t3)                     = map('$'.$_, 12..15);
($t8, $t9)                               = map('$'.$_, 24, 25);
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7) = map('$'.$_, 16..23);
($gp, $sp, $fp, $ra)                     = map('$'.$_, 28..31);

# $ta0..$ta3 are second names for $a4..$a7, i.e. registers 8..11.
($ta0, $ta1, $ta2, $ta3) = ($a4, $a5, $a6, $a7);

#
# O32 needs no special adaptation.  NUBI is handled by saving and
# restoring ($v1,$t0..$t3): with $gp renamed to $v1 here, every NUBI-only
# store/load written against $gp actually operates on register 3.
#
if ($flavour =~ /nubi/i) {
	$gp = $v1;
}
112 #include "mips_arch.h"
114 #if defined(_MIPS_ARCH_MIPS64R6)
115 # define ddivu(rs,rt)
116 # define mfqt(rd,rs,rt) ddivu rd,rs,rt
117 # define mfrm(rd,rs,rt) dmodu rd,rs,rt
118 #elif defined(_MIPS_ARCH_MIPS32R6)
120 # define mfqt(rd,rs,rt) divu rd,rs,rt
121 # define mfrm(rd,rs,rt) modu rd,rs,rt
123 # define $DIVU(rs,rt) $DIVU $zero,rs,rt
124 # define mfqt(rd,rs,rt) mflo rd
125 # define mfrm(rd,rs,rt) mfhi rd
129 .asciiz "mips3.s, Version 1.2"
130 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
136 .globl bn_mul_add_words
137 .ent bn_mul_add_words
140 bgtz $a2,bn_mul_add_words_internal
144 .end bn_mul_add_words
147 .ent bn_mul_add_words_internal
148 bn_mul_add_words_internal:
150 $code.=<<___ if ($flavour =~ /nubi/i);
151 .frame $sp,6*$SZREG,$ra
152 .mask 0x8000f008,-$SZREG
154 $PTR_SUB $sp,6*$SZREG
155 $REG_S $ra,5*$SZREG($sp)
156 $REG_S $t3,4*$SZREG($sp)
157 $REG_S $t2,3*$SZREG($sp)
158 $REG_S $t1,2*$SZREG($sp)
159 $REG_S $t0,1*$SZREG($sp)
160 $REG_S $gp,0*$SZREG($sp)
166 beqz $ta0,.L_bn_mul_add_words_tail
168 .L_bn_mul_add_words_loop:
174 $LD $ta0,2*$BNSZ($a1)
175 $LD $ta1,2*$BNSZ($a0)
177 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
178 # values", but it seems to work fine
179 # even on 64-bit registers.
189 $LD $ta2,3*$BNSZ($a1)
190 $LD $ta3,3*$BNSZ($a0)
213 $ST $ta1,-2*$BNSZ($a0)
227 bgtz $ta0,.L_bn_mul_add_words_loop
230 beqz $a2,.L_bn_mul_add_words_return
233 .L_bn_mul_add_words_tail:
248 beqz $a2,.L_bn_mul_add_words_return
263 beqz $a2,.L_bn_mul_add_words_return
278 .L_bn_mul_add_words_return:
281 $code.=<<___ if ($flavour =~ /nubi/i);
282 $REG_L $t3,4*$SZREG($sp)
283 $REG_L $t2,3*$SZREG($sp)
284 $REG_L $t1,2*$SZREG($sp)
285 $REG_L $t0,1*$SZREG($sp)
286 $REG_L $gp,0*$SZREG($sp)
287 $PTR_ADD $sp,6*$SZREG
292 .end bn_mul_add_words_internal
299 bgtz $a2,bn_mul_words_internal
306 .ent bn_mul_words_internal
307 bn_mul_words_internal:
309 $code.=<<___ if ($flavour =~ /nubi/i);
310 .frame $sp,6*$SZREG,$ra
311 .mask 0x8000f008,-$SZREG
313 $PTR_SUB $sp,6*$SZREG
314 $REG_S $ra,5*$SZREG($sp)
315 $REG_S $t3,4*$SZREG($sp)
316 $REG_S $t2,3*$SZREG($sp)
317 $REG_S $t1,2*$SZREG($sp)
318 $REG_S $t0,1*$SZREG($sp)
319 $REG_S $gp,0*$SZREG($sp)
325 beqz $ta0,.L_bn_mul_words_tail
327 .L_bn_mul_words_loop:
331 $LD $ta0,2*$BNSZ($a1)
332 $LD $ta2,3*$BNSZ($a1)
349 $ST $v0,-3*$BNSZ($a0)
357 $ST $v0,-2*$BNSZ($a0)
367 bgtz $ta0,.L_bn_mul_words_loop
370 beqz $a2,.L_bn_mul_words_return
373 .L_bn_mul_words_tail:
384 beqz $a2,.L_bn_mul_words_return
395 beqz $a2,.L_bn_mul_words_return
406 .L_bn_mul_words_return:
409 $code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $t3,4*$SZREG($sp)
411 $REG_L $t2,3*$SZREG($sp)
412 $REG_L $t1,2*$SZREG($sp)
413 $REG_L $t0,1*$SZREG($sp)
414 $REG_L $gp,0*$SZREG($sp)
415 $PTR_ADD $sp,6*$SZREG
420 .end bn_mul_words_internal
427 bgtz $a2,bn_sqr_words_internal
434 .ent bn_sqr_words_internal
435 bn_sqr_words_internal:
437 $code.=<<___ if ($flavour =~ /nubi/i);
438 .frame $sp,6*$SZREG,$ra
439 .mask 0x8000f008,-$SZREG
441 $PTR_SUB $sp,6*$SZREG
442 $REG_S $ra,5*$SZREG($sp)
443 $REG_S $t3,4*$SZREG($sp)
444 $REG_S $t2,3*$SZREG($sp)
445 $REG_S $t1,2*$SZREG($sp)
446 $REG_S $t0,1*$SZREG($sp)
447 $REG_S $gp,0*$SZREG($sp)
453 beqz $ta0,.L_bn_sqr_words_tail
455 .L_bn_sqr_words_loop:
459 $LD $ta0,2*$BNSZ($a1)
460 $LD $ta2,3*$BNSZ($a1)
472 $ST $t3,-6*$BNSZ($a0)
473 $ST $t2,-5*$BNSZ($a0)
476 mflo ($ta1,$ta0,$ta0)
477 mfhi ($ta0,$ta0,$ta0)
478 $ST $ta1,-4*$BNSZ($a0)
479 $ST $ta0,-3*$BNSZ($a0)
484 mflo ($ta3,$ta2,$ta2)
485 mfhi ($ta2,$ta2,$ta2)
486 $ST $ta3,-2*$BNSZ($a0)
489 bgtz $ta0,.L_bn_sqr_words_loop
492 beqz $a2,.L_bn_sqr_words_return
495 .L_bn_sqr_words_tail:
504 beqz $a2,.L_bn_sqr_words_return
513 beqz $a2,.L_bn_sqr_words_return
522 .L_bn_sqr_words_return:
525 $code.=<<___ if ($flavour =~ /nubi/i);
526 $REG_L $t3,4*$SZREG($sp)
527 $REG_L $t2,3*$SZREG($sp)
528 $REG_L $t1,2*$SZREG($sp)
529 $REG_L $t0,1*$SZREG($sp)
530 $REG_L $gp,0*$SZREG($sp)
531 $PTR_ADD $sp,6*$SZREG
537 .end bn_sqr_words_internal
544 bgtz $a3,bn_add_words_internal
551 .ent bn_add_words_internal
552 bn_add_words_internal:
554 $code.=<<___ if ($flavour =~ /nubi/i);
555 .frame $sp,6*$SZREG,$ra
556 .mask 0x8000f008,-$SZREG
558 $PTR_SUB $sp,6*$SZREG
559 $REG_S $ra,5*$SZREG($sp)
560 $REG_S $t3,4*$SZREG($sp)
561 $REG_S $t2,3*$SZREG($sp)
562 $REG_S $t1,2*$SZREG($sp)
563 $REG_S $t0,1*$SZREG($sp)
564 $REG_S $gp,0*$SZREG($sp)
570 beqz $at,.L_bn_add_words_tail
572 .L_bn_add_words_loop:
582 $LD $ta1,-3*$BNSZ($a2)
584 $LD $ta2,-2*$BNSZ($a2)
590 $ST $t0,-4*$BNSZ($a0)
597 $ST $t1,-3*$BNSZ($a0)
604 $ST $t2,-2*$BNSZ($a0)
614 bgtz $at,.L_bn_add_words_loop
617 beqz $a3,.L_bn_add_words_return
620 .L_bn_add_words_tail:
631 beqz $a3,.L_bn_add_words_return
642 beqz $a3,.L_bn_add_words_return
645 $LD $ta2,2*$BNSZ($a2)
653 .L_bn_add_words_return:
656 $code.=<<___ if ($flavour =~ /nubi/i);
657 $REG_L $t3,4*$SZREG($sp)
658 $REG_L $t2,3*$SZREG($sp)
659 $REG_L $t1,2*$SZREG($sp)
660 $REG_L $t0,1*$SZREG($sp)
661 $REG_L $gp,0*$SZREG($sp)
662 $PTR_ADD $sp,6*$SZREG
668 .end bn_add_words_internal
675 bgtz $a3,bn_sub_words_internal
682 .ent bn_sub_words_internal
683 bn_sub_words_internal:
685 $code.=<<___ if ($flavour =~ /nubi/i);
686 .frame $sp,6*$SZREG,$ra
687 .mask 0x8000f008,-$SZREG
689 $PTR_SUB $sp,6*$SZREG
690 $REG_S $ra,5*$SZREG($sp)
691 $REG_S $t3,4*$SZREG($sp)
692 $REG_S $t2,3*$SZREG($sp)
693 $REG_S $t1,2*$SZREG($sp)
694 $REG_S $t0,1*$SZREG($sp)
695 $REG_S $gp,0*$SZREG($sp)
701 beqz $at,.L_bn_sub_words_tail
703 .L_bn_sub_words_loop:
713 $LD $ta1,-3*$BNSZ($a2)
715 $LD $ta2,-2*$BNSZ($a2)
721 $ST $t0,-4*$BNSZ($a0)
728 $ST $t1,-3*$BNSZ($a0)
736 $ST $t2,-2*$BNSZ($a0)
746 bgtz $at,.L_bn_sub_words_loop
749 beqz $a3,.L_bn_sub_words_return
752 .L_bn_sub_words_tail:
763 beqz $a3,.L_bn_sub_words_return
774 beqz $a3,.L_bn_sub_words_return
777 $LD $ta2,2*$BNSZ($a2)
785 .L_bn_sub_words_return:
788 $code.=<<___ if ($flavour =~ /nubi/i);
789 $REG_L $t3,4*$SZREG($sp)
790 $REG_L $t2,3*$SZREG($sp)
791 $REG_L $t1,2*$SZREG($sp)
792 $REG_L $t0,1*$SZREG($sp)
793 $REG_L $gp,0*$SZREG($sp)
794 $PTR_ADD $sp,6*$SZREG
799 .end bn_sub_words_internal
803 * The bn_div_3_words entry point is re-used for constant-time interface.
804 * Implementation is retained as historical reference.
807 .globl bn_div_3_words
811 move $a3,$a0 # we know that bn_div_words does not
812 # touch $a3, $ta2, $ta3 and preserves $a2
813 # so that we can save two arguments
814 # and return address in registers
815 # instead of stack:-)
819 bne $a0,$a2,bn_div_3_words_internal
827 .ent bn_div_3_words_internal
828 bn_div_3_words_internal:
830 $code.=<<___ if ($flavour =~ /nubi/i);
831 .frame $sp,6*$SZREG,$ra
832 .mask 0x8000f008,-$SZREG
834 $PTR_SUB $sp,6*$SZREG
835 $REG_S $ra,5*$SZREG($sp)
836 $REG_S $t3,4*$SZREG($sp)
837 $REG_S $t2,3*$SZREG($sp)
838 $REG_S $t1,2*$SZREG($sp)
839 $REG_S $t0,1*$SZREG($sp)
840 $REG_S $gp,0*$SZREG($sp)
845 bal bn_div_words_internal
848 $LD $t2,-2*$BNSZ($a3)
853 .L_bn_div_3_words_inner_loop:
854 bnez $t8,.L_bn_div_3_words_inner_loop_done
866 beqz $at,.L_bn_div_3_words_inner_loop
870 .L_bn_div_3_words_inner_loop_done:
873 $code.=<<___ if ($flavour =~ /nubi/i);
874 $REG_L $t3,4*$SZREG($sp)
875 $REG_L $t2,3*$SZREG($sp)
876 $REG_L $t1,2*$SZREG($sp)
877 $REG_L $t0,1*$SZREG($sp)
878 $REG_L $gp,0*$SZREG($sp)
879 $PTR_ADD $sp,6*$SZREG
884 .end bn_div_3_words_internal
892 bnez $a2,bn_div_words_internal
893 li $v0,-1 # I would rather signal div-by-zero
894 # which can be done with 'break 7'
900 .ent bn_div_words_internal
901 bn_div_words_internal:
903 $code.=<<___ if ($flavour =~ /nubi/i);
904 .frame $sp,6*$SZREG,$ra
905 .mask 0x8000f008,-$SZREG
907 $PTR_SUB $sp,6*$SZREG
908 $REG_S $ra,5*$SZREG($sp)
909 $REG_S $t3,4*$SZREG($sp)
910 $REG_S $t2,3*$SZREG($sp)
911 $REG_S $t1,2*$SZREG($sp)
912 $REG_S $t0,1*$SZREG($sp)
913 $REG_S $gp,0*$SZREG($sp)
917 bltz $a2,.L_bn_div_words_body
932 break 6 # signal overflow
942 .L_bn_div_words_body:
943 $SRL $DH,$a2,4*$BNSZ # bits
952 $SRL $HH,$a0,4*$BNSZ # bits
953 $SRL $QT,4*$BNSZ # q=0xffffffff
954 beq $DH,$HH,.L_bn_div_words_skip_div1
957 .L_bn_div_words_skip_div1:
959 $SLL $t3,$a0,4*$BNSZ # bits
960 $SRL $at,$a1,4*$BNSZ # bits
964 .L_bn_div_words_inner_loop1:
972 beqz $at,.L_bn_div_words_inner_loop1_done
975 b .L_bn_div_words_inner_loop1
978 .L_bn_div_words_inner_loop1_done:
980 $SLL $a1,4*$BNSZ # bits
982 $SLL $v0,$QT,4*$BNSZ # bits
985 $SRL $HH,$a0,4*$BNSZ # bits
986 $SRL $QT,4*$BNSZ # q=0xffffffff
987 beq $DH,$HH,.L_bn_div_words_skip_div2
990 .L_bn_div_words_skip_div2:
992 $SLL $t3,$a0,4*$BNSZ # bits
993 $SRL $at,$a1,4*$BNSZ # bits
997 .L_bn_div_words_inner_loop2:
1005 beqz $at,.L_bn_div_words_inner_loop2_done
1008 b .L_bn_div_words_inner_loop2
1011 .L_bn_div_words_inner_loop2_done:
1015 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1016 $SRL $a2,$t9 # restore $a2
1021 $code.=<<___ if ($flavour =~ /nubi/i);
1022 $REG_L $t3,4*$SZREG($sp)
1023 $REG_L $t2,3*$SZREG($sp)
1024 $REG_L $t1,2*$SZREG($sp)
1025 $REG_L $t0,1*$SZREG($sp)
1026 $REG_L $gp,0*$SZREG($sp)
1027 $PTR_ADD $sp,6*$SZREG
1032 .end bn_div_words_internal
# The $HH/$QT/$DH aliases belong to the division code above; clear them.
($HH, $QT, $DH) = ();

# Register assignment for the comba routines that follow.
# Words 0..3 of a[] and b[] sit in $t0..$t3 and $ta0..$ta3.
($a_0, $b_0) = ($t0, $ta0);
($a_1, $b_1) = ($t1, $ta1);
($a_2, $b_2) = ($t2, $ta2);
($a_3, $b_3) = ($t3, $ta3);

# Words 4..6 use $s0..$s5, alternating between a[] and b[].
($a_4, $b_4) = ($s0, $s1);
($a_5, $b_5) = ($s2, $s3);
($a_6, $b_6) = ($s4, $s5);

# Word 7 of each operand takes over that operand's pointer register:
# once a[7] is loaded there is no further use for $a1, and once b[7]
# is loaded there is no further use for $a2.
($a_7, $b_7) = ($a1, $a2);

# $t_1/$t_2 receive the low/high halves of each product; $c_1..$c_3
# are the three accumulator words that rotate between result columns.
($t_1, $t_2)       = ($t8, $t9);
($c_1, $c_2, $c_3) = ($v0, $v1, $a3);
1047 .globl bn_mul_comba8
1052 $code.=<<___ if ($flavour =~ /nubi/i);
1053 .frame $sp,12*$SZREG,$ra
1054 .mask 0x803ff008,-$SZREG
1055 $PTR_SUB $sp,12*$SZREG
1056 $REG_S $ra,11*$SZREG($sp)
1057 $REG_S $s5,10*$SZREG($sp)
1058 $REG_S $s4,9*$SZREG($sp)
1059 $REG_S $s3,8*$SZREG($sp)
1060 $REG_S $s2,7*$SZREG($sp)
1061 $REG_S $s1,6*$SZREG($sp)
1062 $REG_S $s0,5*$SZREG($sp)
1063 $REG_S $t3,4*$SZREG($sp)
1064 $REG_S $t2,3*$SZREG($sp)
1065 $REG_S $t1,2*$SZREG($sp)
1066 $REG_S $t0,1*$SZREG($sp)
1067 $REG_S $gp,0*$SZREG($sp)
1069 $code.=<<___ if ($flavour !~ /nubi/i);
1070 .frame $sp,6*$SZREG,$ra
1071 .mask 0x003f0000,-$SZREG
1072 $PTR_SUB $sp,6*$SZREG
1073 $REG_S $s5,5*$SZREG($sp)
1074 $REG_S $s4,4*$SZREG($sp)
1075 $REG_S $s3,3*$SZREG($sp)
1076 $REG_S $s2,2*$SZREG($sp)
1077 $REG_S $s1,1*$SZREG($sp)
1078 $REG_S $s0,0*$SZREG($sp)
1083 $LD $a_0,0($a1) # If compiled with -mips3 option on
1084 # R5000 box assembler barks on this
1085 # line with "should not have mult/div
1086 # as last instruction in bb (R10K
1087 # bug)" warning. If anybody out there
1088 # has a clue about how to circumvent
1089 # this do send me a note.
1090 # <appro\@fy.chalmers.se>
1094 $LD $a_2,2*$BNSZ($a1)
1095 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1096 $LD $a_3,3*$BNSZ($a1)
1098 $LD $b_2,2*$BNSZ($a2)
1099 $LD $b_3,3*$BNSZ($a2)
1100 mflo ($c_1,$a_0,$b_0)
1101 mfhi ($c_2,$a_0,$b_0)
1103 $LD $a_4,4*$BNSZ($a1)
1104 $LD $a_5,5*$BNSZ($a1)
1105 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1106 $LD $a_6,6*$BNSZ($a1)
1107 $LD $a_7,7*$BNSZ($a1)
1108 $LD $b_4,4*$BNSZ($a2)
1109 $LD $b_5,5*$BNSZ($a2)
1110 mflo ($t_1,$a_0,$b_1)
1111 mfhi ($t_2,$a_0,$b_1)
1114 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1116 $LD $b_6,6*$BNSZ($a2)
1117 $LD $b_7,7*$BNSZ($a2)
1118 $ST $c_1,0($a0) # r[0]=c1;
1119 mflo ($t_1,$a_1,$b_0)
1120 mfhi ($t_2,$a_1,$b_0)
1123 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1127 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1129 mflo ($t_1,$a_2,$b_0)
1130 mfhi ($t_2,$a_2,$b_0)
1133 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1136 mflo ($t_1,$a_1,$b_1)
1137 mfhi ($t_2,$a_1,$b_1)
1140 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1144 mflo ($t_1,$a_0,$b_2)
1145 mfhi ($t_2,$a_0,$b_2)
1148 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1153 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1155 mflo ($t_1,$a_0,$b_3)
1156 mfhi ($t_2,$a_0,$b_3)
1159 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1163 mflo ($t_1,$a_1,$b_2)
1164 mfhi ($t_2,$a_1,$b_2)
1167 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1172 mflo ($t_1,$a_2,$b_1)
1173 mfhi ($t_2,$a_2,$b_1)
1176 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1181 mflo ($t_1,$a_3,$b_0)
1182 mfhi ($t_2,$a_3,$b_0)
1185 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1190 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1192 mflo ($t_1,$a_4,$b_0)
1193 mfhi ($t_2,$a_4,$b_0)
1196 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1200 mflo ($t_1,$a_3,$b_1)
1201 mfhi ($t_2,$a_3,$b_1)
1204 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1209 mflo ($t_1,$a_2,$b_2)
1210 mfhi ($t_2,$a_2,$b_2)
1213 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1218 mflo ($t_1,$a_1,$b_3)
1219 mfhi ($t_2,$a_1,$b_3)
1222 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1227 mflo ($t_1,$a_0,$b_4)
1228 mfhi ($t_2,$a_0,$b_4)
1231 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1236 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1238 mflo ($t_1,$a_0,$b_5)
1239 mfhi ($t_2,$a_0,$b_5)
1242 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1246 mflo ($t_1,$a_1,$b_4)
1247 mfhi ($t_2,$a_1,$b_4)
1250 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1255 mflo ($t_1,$a_2,$b_3)
1256 mfhi ($t_2,$a_2,$b_3)
1259 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1264 mflo ($t_1,$a_3,$b_2)
1265 mfhi ($t_2,$a_3,$b_2)
1268 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1273 mflo ($t_1,$a_4,$b_1)
1274 mfhi ($t_2,$a_4,$b_1)
1277 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1282 mflo ($t_1,$a_5,$b_0)
1283 mfhi ($t_2,$a_5,$b_0)
1286 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1291 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1293 mflo ($t_1,$a_6,$b_0)
1294 mfhi ($t_2,$a_6,$b_0)
1297 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1301 mflo ($t_1,$a_5,$b_1)
1302 mfhi ($t_2,$a_5,$b_1)
1305 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1310 mflo ($t_1,$a_4,$b_2)
1311 mfhi ($t_2,$a_4,$b_2)
1314 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1319 mflo ($t_1,$a_3,$b_3)
1320 mfhi ($t_2,$a_3,$b_3)
1323 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1328 mflo ($t_1,$a_2,$b_4)
1329 mfhi ($t_2,$a_2,$b_4)
1332 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1337 mflo ($t_1,$a_1,$b_5)
1338 mfhi ($t_2,$a_1,$b_5)
1341 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1346 mflo ($t_1,$a_0,$b_6)
1347 mfhi ($t_2,$a_0,$b_6)
1350 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1355 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1357 mflo ($t_1,$a_0,$b_7)
1358 mfhi ($t_2,$a_0,$b_7)
1361 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1365 mflo ($t_1,$a_1,$b_6)
1366 mfhi ($t_2,$a_1,$b_6)
1369 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1374 mflo ($t_1,$a_2,$b_5)
1375 mfhi ($t_2,$a_2,$b_5)
1378 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1383 mflo ($t_1,$a_3,$b_4)
1384 mfhi ($t_2,$a_3,$b_4)
1387 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1392 mflo ($t_1,$a_4,$b_3)
1393 mfhi ($t_2,$a_4,$b_3)
1396 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1401 mflo ($t_1,$a_5,$b_2)
1402 mfhi ($t_2,$a_5,$b_2)
1405 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1410 mflo ($t_1,$a_6,$b_1)
1411 mfhi ($t_2,$a_6,$b_1)
1414 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1419 mflo ($t_1,$a_7,$b_0)
1420 mfhi ($t_2,$a_7,$b_0)
1423 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1428 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1430 mflo ($t_1,$a_7,$b_1)
1431 mfhi ($t_2,$a_7,$b_1)
1434 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1438 mflo ($t_1,$a_6,$b_2)
1439 mfhi ($t_2,$a_6,$b_2)
1442 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1447 mflo ($t_1,$a_5,$b_3)
1448 mfhi ($t_2,$a_5,$b_3)
1451 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1456 mflo ($t_1,$a_4,$b_4)
1457 mfhi ($t_2,$a_4,$b_4)
1460 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1465 mflo ($t_1,$a_3,$b_5)
1466 mfhi ($t_2,$a_3,$b_5)
1469 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1474 mflo ($t_1,$a_2,$b_6)
1475 mfhi ($t_2,$a_2,$b_6)
1478 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1483 mflo ($t_1,$a_1,$b_7)
1484 mfhi ($t_2,$a_1,$b_7)
1487 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1492 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1494 mflo ($t_1,$a_2,$b_7)
1495 mfhi ($t_2,$a_2,$b_7)
1498 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1502 mflo ($t_1,$a_3,$b_6)
1503 mfhi ($t_2,$a_3,$b_6)
1506 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1511 mflo ($t_1,$a_4,$b_5)
1512 mfhi ($t_2,$a_4,$b_5)
1515 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1520 mflo ($t_1,$a_5,$b_4)
1521 mfhi ($t_2,$a_5,$b_4)
1524 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1529 mflo ($t_1,$a_6,$b_3)
1530 mfhi ($t_2,$a_6,$b_3)
1533 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1538 mflo ($t_1,$a_7,$b_2)
1539 mfhi ($t_2,$a_7,$b_2)
1542 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1547 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1549 mflo ($t_1,$a_7,$b_3)
1550 mfhi ($t_2,$a_7,$b_3)
1553 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1557 mflo ($t_1,$a_6,$b_4)
1558 mfhi ($t_2,$a_6,$b_4)
1561 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1566 mflo ($t_1,$a_5,$b_5)
1567 mfhi ($t_2,$a_5,$b_5)
1570 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1575 mflo ($t_1,$a_4,$b_6)
1576 mfhi ($t_2,$a_4,$b_6)
1579 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1584 mflo ($t_1,$a_3,$b_7)
1585 mfhi ($t_2,$a_3,$b_7)
1588 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1593 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1595 mflo ($t_1,$a_4,$b_7)
1596 mfhi ($t_2,$a_4,$b_7)
1599 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1603 mflo ($t_1,$a_5,$b_6)
1604 mfhi ($t_2,$a_5,$b_6)
1607 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1612 mflo ($t_1,$a_6,$b_5)
1613 mfhi ($t_2,$a_6,$b_5)
1616 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1621 mflo ($t_1,$a_7,$b_4)
1622 mfhi ($t_2,$a_7,$b_4)
1625 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1630 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1632 mflo ($t_1,$a_7,$b_5)
1633 mfhi ($t_2,$a_7,$b_5)
1636 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1640 mflo ($t_1,$a_6,$b_6)
1641 mfhi ($t_2,$a_6,$b_6)
1644 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1649 mflo ($t_1,$a_5,$b_7)
1650 mfhi ($t_2,$a_5,$b_7)
1653 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1658 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1660 mflo ($t_1,$a_6,$b_7)
1661 mfhi ($t_2,$a_6,$b_7)
1664 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1668 mflo ($t_1,$a_7,$b_6)
1669 mfhi ($t_2,$a_7,$b_6)
1672 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1677 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1679 mflo ($t_1,$a_7,$b_7)
1680 mfhi ($t_2,$a_7,$b_7)
1685 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1686 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1690 $code.=<<___ if ($flavour =~ /nubi/i);
1691 $REG_L $s5,10*$SZREG($sp)
1692 $REG_L $s4,9*$SZREG($sp)
1693 $REG_L $s3,8*$SZREG($sp)
1694 $REG_L $s2,7*$SZREG($sp)
1695 $REG_L $s1,6*$SZREG($sp)
1696 $REG_L $s0,5*$SZREG($sp)
1697 $REG_L $t3,4*$SZREG($sp)
1698 $REG_L $t2,3*$SZREG($sp)
1699 $REG_L $t1,2*$SZREG($sp)
1700 $REG_L $t0,1*$SZREG($sp)
1701 $REG_L $gp,0*$SZREG($sp)
1703 $PTR_ADD $sp,12*$SZREG
1705 $code.=<<___ if ($flavour !~ /nubi/i);
1706 $REG_L $s5,5*$SZREG($sp)
1707 $REG_L $s4,4*$SZREG($sp)
1708 $REG_L $s3,3*$SZREG($sp)
1709 $REG_L $s2,2*$SZREG($sp)
1710 $REG_L $s1,1*$SZREG($sp)
1711 $REG_L $s0,0*$SZREG($sp)
1713 $PTR_ADD $sp,6*$SZREG
1719 .globl bn_mul_comba4
1723 $code.=<<___ if ($flavour =~ /nubi/i);
1724 .frame $sp,6*$SZREG,$ra
1725 .mask 0x8000f008,-$SZREG
1727 $PTR_SUB $sp,6*$SZREG
1728 $REG_S $ra,5*$SZREG($sp)
1729 $REG_S $t3,4*$SZREG($sp)
1730 $REG_S $t2,3*$SZREG($sp)
1731 $REG_S $t1,2*$SZREG($sp)
1732 $REG_S $t0,1*$SZREG($sp)
1733 $REG_S $gp,0*$SZREG($sp)
1740 $LD $a_2,2*$BNSZ($a1)
1741 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1742 $LD $a_3,3*$BNSZ($a1)
1744 $LD $b_2,2*$BNSZ($a2)
1745 $LD $b_3,3*$BNSZ($a2)
1746 mflo ($c_1,$a_0,$b_0)
1747 mfhi ($c_2,$a_0,$b_0)
1750 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1751 mflo ($t_1,$a_0,$b_1)
1752 mfhi ($t_2,$a_0,$b_1)
1755 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1757 mflo ($t_1,$a_1,$b_0)
1758 mfhi ($t_2,$a_1,$b_0)
1761 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1767 mflo ($t_1,$a_2,$b_0)
1768 mfhi ($t_2,$a_2,$b_0)
1771 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1774 mflo ($t_1,$a_1,$b_1)
1775 mfhi ($t_2,$a_1,$b_1)
1778 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1782 mflo ($t_1,$a_0,$b_2)
1783 mfhi ($t_2,$a_0,$b_2)
1786 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1791 $ST $c_3,2*$BNSZ($a0)
1793 mflo ($t_1,$a_0,$b_3)
1794 mfhi ($t_2,$a_0,$b_3)
1797 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1801 mflo ($t_1,$a_1,$b_2)
1802 mfhi ($t_2,$a_1,$b_2)
1805 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1810 mflo ($t_1,$a_2,$b_1)
1811 mfhi ($t_2,$a_2,$b_1)
1814 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1819 mflo ($t_1,$a_3,$b_0)
1820 mfhi ($t_2,$a_3,$b_0)
1823 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1828 $ST $c_1,3*$BNSZ($a0)
1830 mflo ($t_1,$a_3,$b_1)
1831 mfhi ($t_2,$a_3,$b_1)
1834 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1838 mflo ($t_1,$a_2,$b_2)
1839 mfhi ($t_2,$a_2,$b_2)
1842 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1847 mflo ($t_1,$a_1,$b_3)
1848 mfhi ($t_2,$a_1,$b_3)
1851 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1856 $ST $c_2,4*$BNSZ($a0)
1858 mflo ($t_1,$a_2,$b_3)
1859 mfhi ($t_2,$a_2,$b_3)
1862 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1866 mflo ($t_1,$a_3,$b_2)
1867 mfhi ($t_2,$a_3,$b_2)
1870 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1875 $ST $c_3,5*$BNSZ($a0)
1877 mflo ($t_1,$a_3,$b_3)
1878 mfhi ($t_2,$a_3,$b_3)
1883 $ST $c_1,6*$BNSZ($a0)
1884 $ST $c_2,7*$BNSZ($a0)
1888 $code.=<<___ if ($flavour =~ /nubi/i);
1889 $REG_L $t3,4*$SZREG($sp)
1890 $REG_L $t2,3*$SZREG($sp)
1891 $REG_L $t1,2*$SZREG($sp)
1892 $REG_L $t0,1*$SZREG($sp)
1893 $REG_L $gp,0*$SZREG($sp)
1894 $PTR_ADD $sp,6*$SZREG
# From this point $a_4..$a_7 name the same registers as $b_0..$b_3,
# replacing the earlier $s0/$s2/$s4/$a1 assignment.  The squaring
# routines below load a[4..7] through these names.
$a_4 = $b_0;
$a_5 = $b_1;
$a_6 = $b_2;
$a_7 = $b_3;
1905 my ($hi,$lo,$c0,$c1,$c2,
1906 $warm, # !$warm denotes first call with specific sequence of
1907 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1908 $an,$bn # these two are arguments for multiplication which
1909 # result is used in *next* step [which is why it's
1910 # commented as "forward multiplication" below];
1915 $MULTU ($an,$bn) # forward multiplication
1922 $code.=<<___ if (!$warm);
1926 $code.=<<___ if ($warm);
1942 .globl bn_sqr_comba8
1946 $code.=<<___ if ($flavour =~ /nubi/i);
1947 .frame $sp,6*$SZREG,$ra
1948 .mask 0x8000f008,-$SZREG
1950 $PTR_SUB $sp,6*$SZREG
1951 $REG_S $ra,5*$SZREG($sp)
1952 $REG_S $t3,4*$SZREG($sp)
1953 $REG_S $t2,3*$SZREG($sp)
1954 $REG_S $t1,2*$SZREG($sp)
1955 $REG_S $t0,1*$SZREG($sp)
1956 $REG_S $gp,0*$SZREG($sp)
1962 $LD $a_2,2*$BNSZ($a1)
1963 $LD $a_3,3*$BNSZ($a1)
1965 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1966 $LD $a_4,4*$BNSZ($a1)
1967 $LD $a_5,5*$BNSZ($a1)
1968 $LD $a_6,6*$BNSZ($a1)
1969 $LD $a_7,7*$BNSZ($a1)
1970 mflo ($c_1,$a_0,$a_0)
1971 mfhi ($c_2,$a_0,$a_0)
1974 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1975 mflo ($t_1,$a_0,$a_1)
1976 mfhi ($t_2,$a_0,$a_1)
1979 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
1987 mflo ($t_1,$a_2,$a_0)
1988 mfhi ($t_2,$a_2,$a_0)
1990 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1991 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1995 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2000 $ST $c_3,2*$BNSZ($a0)
2001 mflo ($t_1,$a_0,$a_3)
2002 mfhi ($t_2,$a_0,$a_3)
2004 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2005 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2006 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2007 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2009 $ST $c_1,3*$BNSZ($a0)
2011 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2012 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2013 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2014 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2018 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
2023 $ST $c_2,4*$BNSZ($a0)
2024 mflo ($t_1,$a_0,$a_5)
2025 mfhi ($t_2,$a_0,$a_5)
2027 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2028 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2029 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2030 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2031 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2032 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2034 $ST $c_3,5*$BNSZ($a0)
2036 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2037 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2038 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2039 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2040 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2041 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2045 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
2050 $ST $c_1,6*$BNSZ($a0)
2051 mflo ($t_1,$a_0,$a_7)
2052 mfhi ($t_2,$a_0,$a_7)
2054 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2055 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2056 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2057 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2058 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2059 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2060 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2061 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2063 $ST $c_2,7*$BNSZ($a0)
2065 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2066 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2067 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2068 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2069 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2070 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2074 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
2079 $ST $c_3,8*$BNSZ($a0)
2080 mflo ($t_1,$a_2,$a_7)
2081 mfhi ($t_2,$a_2,$a_7)
2083 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2084 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2085 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2086 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2087 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2088 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2090 $ST $c_1,9*$BNSZ($a0)
2092 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2093 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2094 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2095 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2099 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
2104 $ST $c_2,10*$BNSZ($a0)
2105 mflo ($t_1,$a_4,$a_7)
2106 mfhi ($t_2,$a_4,$a_7)
2108 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2109 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2110 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2111 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2113 $ST $c_3,11*$BNSZ($a0)
2115 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2116 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2120 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
2125 $ST $c_1,12*$BNSZ($a0)
2126 mflo ($t_1,$a_6,$a_7)
2127 mfhi ($t_2,$a_6,$a_7)
2129 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2130 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2132 $ST $c_2,13*$BNSZ($a0)
2138 $ST $c_3,14*$BNSZ($a0)
2139 $ST $c_1,15*$BNSZ($a0)
2143 $code.=<<___ if ($flavour =~ /nubi/i);
2144 $REG_L $t3,4*$SZREG($sp)
2145 $REG_L $t2,3*$SZREG($sp)
2146 $REG_L $t1,2*$SZREG($sp)
2147 $REG_L $t0,1*$SZREG($sp)
2148 $REG_L $gp,0*$SZREG($sp)
2149 $PTR_ADD $sp,6*$SZREG
2157 .globl bn_sqr_comba4
2161 $code.=<<___ if ($flavour =~ /nubi/i);
2162 .frame $sp,6*$SZREG,$ra
2163 .mask 0x8000f008,-$SZREG
2165 $PTR_SUB $sp,6*$SZREG
2166 $REG_S $ra,5*$SZREG($sp)
2167 $REG_S $t3,4*$SZREG($sp)
2168 $REG_S $t2,3*$SZREG($sp)
2169 $REG_S $t1,2*$SZREG($sp)
2170 $REG_S $t0,1*$SZREG($sp)
2171 $REG_S $gp,0*$SZREG($sp)
2177 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
2178 $LD $a_2,2*$BNSZ($a1)
2179 $LD $a_3,3*$BNSZ($a1)
2180 mflo ($c_1,$a_0,$a_0)
2181 mfhi ($c_2,$a_0,$a_0)
2184 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2185 mflo ($t_1,$a_0,$a_1)
2186 mfhi ($t_2,$a_0,$a_1)
2189 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
2197 mflo ($t_1,$a_2,$a_0)
2198 mfhi ($t_2,$a_2,$a_0)
2200 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2201 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2205 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2210 $ST $c_3,2*$BNSZ($a0)
2211 mflo ($t_1,$a_0,$a_3)
2212 mfhi ($t_2,$a_0,$a_3)
2214 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2215 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2216 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2217 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2219 $ST $c_1,3*$BNSZ($a0)
2221 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2222 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2226 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
2231 $ST $c_2,4*$BNSZ($a0)
2232 mflo ($t_1,$a_2,$a_3)
2233 mfhi ($t_2,$a_2,$a_3)
2235 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2236 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2238 $ST $c_3,5*$BNSZ($a0)
2244 $ST $c_1,6*$BNSZ($a0)
2245 $ST $c_2,7*$BNSZ($a0)
2249 $code.=<<___ if ($flavour =~ /nubi/i);
2250 $REG_L $t3,4*$SZREG($sp)
2251 $REG_L $t2,3*$SZREG($sp)
2252 $REG_L $t1,2*$SZREG($sp)
2253 $REG_L $t0,1*$SZREG($sp)
2254 $REG_L $gp,0*$SZREG($sp)
2255 $PTR_ADD $sp,6*$SZREG