2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # Rights for redistribution and usage in source and binary forms are
15 # granted according to the License. Warranty of any kind is disclaimed.
16 # ====================================================================
21 # This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
23 # The module is designed to work with either of the "new" MIPS ABI(5),
24 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
25 # IRIX 5.x, not only because 5.x doesn't support the new ABIs but also
26 # because 5.x kernels put the R4x00 CPU into 32-bit mode, so all those
27 # 64-bit instructions (daddu, dmultu, etc.) found below would only
28 # cause an illegal instruction exception:-(
30 # In addition the code depends on preprocessor flags set up by MIPSpro
31 # compiler driver (either as or cc) and therefore (probably?) can't be
32 # compiled by the GNU assembler. GNU C driver manages fine though...
33 # I mean as long as -mmips-as is specified or is the default option,
34 # because then it simply invokes /usr/bin/as which in turn takes
35 # perfect care of the preprocessor definitions. Another neat feature
36 # offered by the MIPSpro assembler is an optimization pass. This gave
37 # me the opportunity to have the code looking more regular as all those
38 # architecture dependent instruction rescheduling details were left to
39 # the assembler. Cool, huh?
41 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
42 # goes way over 3 times faster!
48 # Adapt the module even for 32-bit ABIs and other OSes. The former was
49 # achieved by mechanical replacement of 64-bit arithmetic instructions
50 # such as dmultu, daddu, etc. with their 32-bit counterparts and
51 # adjusting offsets denoting multiples of BN_ULONG. The above-mentioned
52 # >3x performance improvement naturally does not apply to 32-bit code
53 # [because there is no instruction a 32-bit compiler can't use]; one
54 # has to be content with a 40-85% improvement depending on benchmark and
55 # key length, more for longer keys.
57 # $output is the last argument if it looks like a file (it has an extension), otherwise undef.
58 # $flavour is the first argument if it doesn't look like a file (contains no "."), otherwise "o32".
59 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # taken off the argument list first, so $flavour is tested against what remains
60 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32"; # $flavour later selects the /64|n32/ and /nubi/ code paths
62 if ($flavour =~ /64|n32/i) {
92 $code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n"; # start of the generated text: emits ".set mips2" unless __mips_isa_rev is defined and >= 6
95 open(STDOUT,">",$output) or die "can't open $output: $!" if $output; # redirect generated code to $output (3-arg open: name is never parsed as a mode); abort instead of silently writing to the old STDOUT
97 # N32/64 register layout of the original module: every name below holds its "$<number>" operand string.
99 ($zero,$at,$v0,$v1)=map(sprintf('$%d',$_),0..3);
100 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map(sprintf('$%d',$_),4..11);
101 ($t0,$t1,$t2,$t3,$t8,$t9)=map(sprintf('$%d',$_),12..15,24,25);
102 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map(sprintf('$%d',$_),16..23);
103 ($gp,$sp,$fp,$ra)=map(sprintf('$%d',$_),28..31);
104 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); # $ta0..$ta3 are the same registers as $a4..$a7 ($8..$11)
106 # No special adaptation is required for O32. NUBI on the other hand
107 # is treated by saving/restoring ($v1,$t0..$t3).
109 $gp=$v1 if ($flavour =~ /nubi/i); # under NUBI "$gp" becomes another name for $v1, so the "$REG_S/$REG_L $gp,0*$SZREG($sp)" lines in the NUBI frames save/restore $v1
114 #include "mips_arch.h"
116 #if defined(_MIPS_ARCH_MIPS64R6)
117 # define ddivu(rs,rt)
118 # define mfqt(rd,rs,rt) ddivu rd,rs,rt
119 # define mfrm(rd,rs,rt) dmodu rd,rs,rt
120 #elif defined(_MIPS_ARCH_MIPS32R6)
122 # define mfqt(rd,rs,rt) divu rd,rs,rt
123 # define mfrm(rd,rs,rt) modu rd,rs,rt
125 # define $DIVU(rs,rt) $DIVU $zero,rs,rt
126 # define mfqt(rd,rs,rt) mflo rd
127 # define mfrm(rd,rs,rt) mfhi rd
131 .asciiz "mips3.s, Version 1.2"
132 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
138 .globl bn_mul_add_words
139 .ent bn_mul_add_words
142 bgtz $a2,bn_mul_add_words_internal
146 .end bn_mul_add_words
149 .ent bn_mul_add_words_internal
150 bn_mul_add_words_internal:
152 $code.=<<___ if ($flavour =~ /nubi/i);
153 .frame $sp,6*$SZREG,$ra
154 .mask 0x8000f008,-$SZREG
156 $PTR_SUB $sp,6*$SZREG
157 $REG_S $ra,5*$SZREG($sp)
158 $REG_S $t3,4*$SZREG($sp)
159 $REG_S $t2,3*$SZREG($sp)
160 $REG_S $t1,2*$SZREG($sp)
161 $REG_S $t0,1*$SZREG($sp)
162 $REG_S $gp,0*$SZREG($sp)
168 beqz $ta0,.L_bn_mul_add_words_tail
170 .L_bn_mul_add_words_loop:
176 $LD $ta0,2*$BNSZ($a1)
177 $LD $ta1,2*$BNSZ($a0)
179 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
180 # values", but it seems to work fine
181 # even on 64-bit registers.
191 $LD $ta2,3*$BNSZ($a1)
192 $LD $ta3,3*$BNSZ($a0)
215 $ST $ta1,-2*$BNSZ($a0)
229 bgtz $ta0,.L_bn_mul_add_words_loop
232 beqz $a2,.L_bn_mul_add_words_return
235 .L_bn_mul_add_words_tail:
250 beqz $a2,.L_bn_mul_add_words_return
265 beqz $a2,.L_bn_mul_add_words_return
280 .L_bn_mul_add_words_return:
283 $code.=<<___ if ($flavour =~ /nubi/i);
284 $REG_L $t3,4*$SZREG($sp)
285 $REG_L $t2,3*$SZREG($sp)
286 $REG_L $t1,2*$SZREG($sp)
287 $REG_L $t0,1*$SZREG($sp)
288 $REG_L $gp,0*$SZREG($sp)
289 $PTR_ADD $sp,6*$SZREG
294 .end bn_mul_add_words_internal
301 bgtz $a2,bn_mul_words_internal
308 .ent bn_mul_words_internal
309 bn_mul_words_internal:
311 $code.=<<___ if ($flavour =~ /nubi/i);
312 .frame $sp,6*$SZREG,$ra
313 .mask 0x8000f008,-$SZREG
315 $PTR_SUB $sp,6*$SZREG
316 $REG_S $ra,5*$SZREG($sp)
317 $REG_S $t3,4*$SZREG($sp)
318 $REG_S $t2,3*$SZREG($sp)
319 $REG_S $t1,2*$SZREG($sp)
320 $REG_S $t0,1*$SZREG($sp)
321 $REG_S $gp,0*$SZREG($sp)
327 beqz $ta0,.L_bn_mul_words_tail
329 .L_bn_mul_words_loop:
333 $LD $ta0,2*$BNSZ($a1)
334 $LD $ta2,3*$BNSZ($a1)
351 $ST $v0,-3*$BNSZ($a0)
359 $ST $v0,-2*$BNSZ($a0)
369 bgtz $ta0,.L_bn_mul_words_loop
372 beqz $a2,.L_bn_mul_words_return
375 .L_bn_mul_words_tail:
386 beqz $a2,.L_bn_mul_words_return
397 beqz $a2,.L_bn_mul_words_return
408 .L_bn_mul_words_return:
411 $code.=<<___ if ($flavour =~ /nubi/i);
412 $REG_L $t3,4*$SZREG($sp)
413 $REG_L $t2,3*$SZREG($sp)
414 $REG_L $t1,2*$SZREG($sp)
415 $REG_L $t0,1*$SZREG($sp)
416 $REG_L $gp,0*$SZREG($sp)
417 $PTR_ADD $sp,6*$SZREG
422 .end bn_mul_words_internal
429 bgtz $a2,bn_sqr_words_internal
436 .ent bn_sqr_words_internal
437 bn_sqr_words_internal:
439 $code.=<<___ if ($flavour =~ /nubi/i);
440 .frame $sp,6*$SZREG,$ra
441 .mask 0x8000f008,-$SZREG
443 $PTR_SUB $sp,6*$SZREG
444 $REG_S $ra,5*$SZREG($sp)
445 $REG_S $t3,4*$SZREG($sp)
446 $REG_S $t2,3*$SZREG($sp)
447 $REG_S $t1,2*$SZREG($sp)
448 $REG_S $t0,1*$SZREG($sp)
449 $REG_S $gp,0*$SZREG($sp)
455 beqz $ta0,.L_bn_sqr_words_tail
457 .L_bn_sqr_words_loop:
461 $LD $ta0,2*$BNSZ($a1)
462 $LD $ta2,3*$BNSZ($a1)
474 $ST $t3,-6*$BNSZ($a0)
475 $ST $t2,-5*$BNSZ($a0)
478 mflo ($ta1,$ta0,$ta0)
479 mfhi ($ta0,$ta0,$ta0)
480 $ST $ta1,-4*$BNSZ($a0)
481 $ST $ta0,-3*$BNSZ($a0)
486 mflo ($ta3,$ta2,$ta2)
487 mfhi ($ta2,$ta2,$ta2)
488 $ST $ta3,-2*$BNSZ($a0)
491 bgtz $ta0,.L_bn_sqr_words_loop
494 beqz $a2,.L_bn_sqr_words_return
497 .L_bn_sqr_words_tail:
506 beqz $a2,.L_bn_sqr_words_return
515 beqz $a2,.L_bn_sqr_words_return
524 .L_bn_sqr_words_return:
527 $code.=<<___ if ($flavour =~ /nubi/i);
528 $REG_L $t3,4*$SZREG($sp)
529 $REG_L $t2,3*$SZREG($sp)
530 $REG_L $t1,2*$SZREG($sp)
531 $REG_L $t0,1*$SZREG($sp)
532 $REG_L $gp,0*$SZREG($sp)
533 $PTR_ADD $sp,6*$SZREG
539 .end bn_sqr_words_internal
546 bgtz $a3,bn_add_words_internal
553 .ent bn_add_words_internal
554 bn_add_words_internal:
556 $code.=<<___ if ($flavour =~ /nubi/i);
557 .frame $sp,6*$SZREG,$ra
558 .mask 0x8000f008,-$SZREG
560 $PTR_SUB $sp,6*$SZREG
561 $REG_S $ra,5*$SZREG($sp)
562 $REG_S $t3,4*$SZREG($sp)
563 $REG_S $t2,3*$SZREG($sp)
564 $REG_S $t1,2*$SZREG($sp)
565 $REG_S $t0,1*$SZREG($sp)
566 $REG_S $gp,0*$SZREG($sp)
572 beqz $at,.L_bn_add_words_tail
574 .L_bn_add_words_loop:
584 $LD $ta1,-3*$BNSZ($a2)
586 $LD $ta2,-2*$BNSZ($a2)
592 $ST $t0,-4*$BNSZ($a0)
599 $ST $t1,-3*$BNSZ($a0)
606 $ST $t2,-2*$BNSZ($a0)
616 bgtz $at,.L_bn_add_words_loop
619 beqz $a3,.L_bn_add_words_return
622 .L_bn_add_words_tail:
633 beqz $a3,.L_bn_add_words_return
644 beqz $a3,.L_bn_add_words_return
647 $LD $ta2,2*$BNSZ($a2)
655 .L_bn_add_words_return:
658 $code.=<<___ if ($flavour =~ /nubi/i);
659 $REG_L $t3,4*$SZREG($sp)
660 $REG_L $t2,3*$SZREG($sp)
661 $REG_L $t1,2*$SZREG($sp)
662 $REG_L $t0,1*$SZREG($sp)
663 $REG_L $gp,0*$SZREG($sp)
664 $PTR_ADD $sp,6*$SZREG
670 .end bn_add_words_internal
677 bgtz $a3,bn_sub_words_internal
684 .ent bn_sub_words_internal
685 bn_sub_words_internal:
687 $code.=<<___ if ($flavour =~ /nubi/i);
688 .frame $sp,6*$SZREG,$ra
689 .mask 0x8000f008,-$SZREG
691 $PTR_SUB $sp,6*$SZREG
692 $REG_S $ra,5*$SZREG($sp)
693 $REG_S $t3,4*$SZREG($sp)
694 $REG_S $t2,3*$SZREG($sp)
695 $REG_S $t1,2*$SZREG($sp)
696 $REG_S $t0,1*$SZREG($sp)
697 $REG_S $gp,0*$SZREG($sp)
703 beqz $at,.L_bn_sub_words_tail
705 .L_bn_sub_words_loop:
715 $LD $ta1,-3*$BNSZ($a2)
717 $LD $ta2,-2*$BNSZ($a2)
723 $ST $t0,-4*$BNSZ($a0)
730 $ST $t1,-3*$BNSZ($a0)
738 $ST $t2,-2*$BNSZ($a0)
748 bgtz $at,.L_bn_sub_words_loop
751 beqz $a3,.L_bn_sub_words_return
754 .L_bn_sub_words_tail:
765 beqz $a3,.L_bn_sub_words_return
776 beqz $a3,.L_bn_sub_words_return
779 $LD $ta2,2*$BNSZ($a2)
787 .L_bn_sub_words_return:
790 $code.=<<___ if ($flavour =~ /nubi/i);
791 $REG_L $t3,4*$SZREG($sp)
792 $REG_L $t2,3*$SZREG($sp)
793 $REG_L $t1,2*$SZREG($sp)
794 $REG_L $t0,1*$SZREG($sp)
795 $REG_L $gp,0*$SZREG($sp)
796 $PTR_ADD $sp,6*$SZREG
801 .end bn_sub_words_internal
805 * The bn_div_3_words entry point is re-used for constant-time interface.
806 * Implementation is retained as historical reference.
809 .globl bn_div_3_words
813 move $a3,$a0 # we know that bn_div_words does not
814 # touch $a3, $ta2, $ta3 and preserves $a2
815 # so that we can save two arguments
816 # and return address in registers
817 # instead of stack:-)
821 bne $a0,$a2,bn_div_3_words_internal
829 .ent bn_div_3_words_internal
830 bn_div_3_words_internal:
832 $code.=<<___ if ($flavour =~ /nubi/i);
833 .frame $sp,6*$SZREG,$ra
834 .mask 0x8000f008,-$SZREG
836 $PTR_SUB $sp,6*$SZREG
837 $REG_S $ra,5*$SZREG($sp)
838 $REG_S $t3,4*$SZREG($sp)
839 $REG_S $t2,3*$SZREG($sp)
840 $REG_S $t1,2*$SZREG($sp)
841 $REG_S $t0,1*$SZREG($sp)
842 $REG_S $gp,0*$SZREG($sp)
847 bal bn_div_words_internal
850 $LD $t2,-2*$BNSZ($a3)
855 .L_bn_div_3_words_inner_loop:
856 bnez $t8,.L_bn_div_3_words_inner_loop_done
868 beqz $at,.L_bn_div_3_words_inner_loop
872 .L_bn_div_3_words_inner_loop_done:
875 $code.=<<___ if ($flavour =~ /nubi/i);
876 $REG_L $t3,4*$SZREG($sp)
877 $REG_L $t2,3*$SZREG($sp)
878 $REG_L $t1,2*$SZREG($sp)
879 $REG_L $t0,1*$SZREG($sp)
880 $REG_L $gp,0*$SZREG($sp)
881 $PTR_ADD $sp,6*$SZREG
886 .end bn_div_3_words_internal
894 bnez $a2,bn_div_words_internal
895 li $v0,-1 # I would rather signal div-by-zero
896 # which can be done with 'break 7'
902 .ent bn_div_words_internal
903 bn_div_words_internal:
905 $code.=<<___ if ($flavour =~ /nubi/i);
906 .frame $sp,6*$SZREG,$ra
907 .mask 0x8000f008,-$SZREG
909 $PTR_SUB $sp,6*$SZREG
910 $REG_S $ra,5*$SZREG($sp)
911 $REG_S $t3,4*$SZREG($sp)
912 $REG_S $t2,3*$SZREG($sp)
913 $REG_S $t1,2*$SZREG($sp)
914 $REG_S $t0,1*$SZREG($sp)
915 $REG_S $gp,0*$SZREG($sp)
919 bltz $a2,.L_bn_div_words_body
934 break 6 # signal overflow
944 .L_bn_div_words_body:
945 $SRL $DH,$a2,4*$BNSZ # bits
954 $SRL $HH,$a0,4*$BNSZ # bits
955 $SRL $QT,4*$BNSZ # q=0xffffffff
956 beq $DH,$HH,.L_bn_div_words_skip_div1
959 .L_bn_div_words_skip_div1:
961 $SLL $t3,$a0,4*$BNSZ # bits
962 $SRL $at,$a1,4*$BNSZ # bits
966 .L_bn_div_words_inner_loop1:
974 beqz $at,.L_bn_div_words_inner_loop1_done
977 b .L_bn_div_words_inner_loop1
980 .L_bn_div_words_inner_loop1_done:
982 $SLL $a1,4*$BNSZ # bits
984 $SLL $v0,$QT,4*$BNSZ # bits
987 $SRL $HH,$a0,4*$BNSZ # bits
988 $SRL $QT,4*$BNSZ # q=0xffffffff
989 beq $DH,$HH,.L_bn_div_words_skip_div2
992 .L_bn_div_words_skip_div2:
994 $SLL $t3,$a0,4*$BNSZ # bits
995 $SRL $at,$a1,4*$BNSZ # bits
999 .L_bn_div_words_inner_loop2:
1007 beqz $at,.L_bn_div_words_inner_loop2_done
1010 b .L_bn_div_words_inner_loop2
1013 .L_bn_div_words_inner_loop2_done:
1017 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1018 $SRL $a2,$t9 # restore $a2
1023 $code.=<<___ if ($flavour =~ /nubi/i);
1024 $REG_L $t3,4*$SZREG($sp)
1025 $REG_L $t2,3*$SZREG($sp)
1026 $REG_L $t1,2*$SZREG($sp)
1027 $REG_L $t0,1*$SZREG($sp)
1028 $REG_L $gp,0*$SZREG($sp)
1029 $PTR_ADD $sp,6*$SZREG
1034 .end bn_div_words_internal
1036 undef $HH; undef $QT; undef $DH; # discard the register aliases used by the bn_div_words code above
1038 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); # a[0..3] are kept in $t0..$t3
1039 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); # b[0..3] are kept in $ta0..$ta3
1041 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # a[4..6] need $s registers (saved in the bn_mul_comba8 frame); once we load a[7], no use for $a1
1042 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # b[4..6] likewise in $s registers; once we load b[7], no use for $a2
1044 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); # $t_1/$t_2 receive the mflo/mfhi halves of each product; $c_1..$c_3 are the c1,c2,c3 of the mul_add_c comments
1049 .globl bn_mul_comba8
1054 $code.=<<___ if ($flavour =~ /nubi/i);
1055 .frame $sp,12*$SZREG,$ra
1056 .mask 0x803ff008,-$SZREG
1057 $PTR_SUB $sp,12*$SZREG
1058 $REG_S $ra,11*$SZREG($sp)
1059 $REG_S $s5,10*$SZREG($sp)
1060 $REG_S $s4,9*$SZREG($sp)
1061 $REG_S $s3,8*$SZREG($sp)
1062 $REG_S $s2,7*$SZREG($sp)
1063 $REG_S $s1,6*$SZREG($sp)
1064 $REG_S $s0,5*$SZREG($sp)
1065 $REG_S $t3,4*$SZREG($sp)
1066 $REG_S $t2,3*$SZREG($sp)
1067 $REG_S $t1,2*$SZREG($sp)
1068 $REG_S $t0,1*$SZREG($sp)
1069 $REG_S $gp,0*$SZREG($sp)
1071 $code.=<<___ if ($flavour !~ /nubi/i);
1072 .frame $sp,6*$SZREG,$ra
1073 .mask 0x003f0000,-$SZREG
1074 $PTR_SUB $sp,6*$SZREG
1075 $REG_S $s5,5*$SZREG($sp)
1076 $REG_S $s4,4*$SZREG($sp)
1077 $REG_S $s3,3*$SZREG($sp)
1078 $REG_S $s2,2*$SZREG($sp)
1079 $REG_S $s1,1*$SZREG($sp)
1080 $REG_S $s0,0*$SZREG($sp)
1085 $LD $a_0,0($a1) # If compiled with -mips3 option on
1086 # R5000 box assembler barks on this
1087 # 1ine with "should not have mult/div
1088 # as last instruction in bb (R10K
1089 # bug)" warning. If anybody out there
1090 # has a clue about how to circumvent
1091 # this do send me a note.
1092 # <appro\@fy.chalmers.se>
1096 $LD $a_2,2*$BNSZ($a1)
1097 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1098 $LD $a_3,3*$BNSZ($a1)
1100 $LD $b_2,2*$BNSZ($a2)
1101 $LD $b_3,3*$BNSZ($a2)
1102 mflo ($c_1,$a_0,$b_0)
1103 mfhi ($c_2,$a_0,$b_0)
1105 $LD $a_4,4*$BNSZ($a1)
1106 $LD $a_5,5*$BNSZ($a1)
1107 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1108 $LD $a_6,6*$BNSZ($a1)
1109 $LD $a_7,7*$BNSZ($a1)
1110 $LD $b_4,4*$BNSZ($a2)
1111 $LD $b_5,5*$BNSZ($a2)
1112 mflo ($t_1,$a_0,$b_1)
1113 mfhi ($t_2,$a_0,$b_1)
1116 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1118 $LD $b_6,6*$BNSZ($a2)
1119 $LD $b_7,7*$BNSZ($a2)
1120 $ST $c_1,0($a0) # r[0]=c1;
1121 mflo ($t_1,$a_1,$b_0)
1122 mfhi ($t_2,$a_1,$b_0)
1125 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1129 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1131 mflo ($t_1,$a_2,$b_0)
1132 mfhi ($t_2,$a_2,$b_0)
1135 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1138 mflo ($t_1,$a_1,$b_1)
1139 mfhi ($t_2,$a_1,$b_1)
1142 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1146 mflo ($t_1,$a_0,$b_2)
1147 mfhi ($t_2,$a_0,$b_2)
1150 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1155 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1157 mflo ($t_1,$a_0,$b_3)
1158 mfhi ($t_2,$a_0,$b_3)
1161 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1165 mflo ($t_1,$a_1,$b_2)
1166 mfhi ($t_2,$a_1,$b_2)
1169 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1174 mflo ($t_1,$a_2,$b_1)
1175 mfhi ($t_2,$a_2,$b_1)
1178 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1183 mflo ($t_1,$a_3,$b_0)
1184 mfhi ($t_2,$a_3,$b_0)
1187 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1192 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1194 mflo ($t_1,$a_4,$b_0)
1195 mfhi ($t_2,$a_4,$b_0)
1198 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1202 mflo ($t_1,$a_3,$b_1)
1203 mfhi ($t_2,$a_3,$b_1)
1206 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1211 mflo ($t_1,$a_2,$b_2)
1212 mfhi ($t_2,$a_2,$b_2)
1215 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1220 mflo ($t_1,$a_1,$b_3)
1221 mfhi ($t_2,$a_1,$b_3)
1224 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1229 mflo ($t_1,$a_0,$b_4)
1230 mfhi ($t_2,$a_0,$b_4)
1233 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1238 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1240 mflo ($t_1,$a_0,$b_5)
1241 mfhi ($t_2,$a_0,$b_5)
1244 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1248 mflo ($t_1,$a_1,$b_4)
1249 mfhi ($t_2,$a_1,$b_4)
1252 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1257 mflo ($t_1,$a_2,$b_3)
1258 mfhi ($t_2,$a_2,$b_3)
1261 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1266 mflo ($t_1,$a_3,$b_2)
1267 mfhi ($t_2,$a_3,$b_2)
1270 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1275 mflo ($t_1,$a_4,$b_1)
1276 mfhi ($t_2,$a_4,$b_1)
1279 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1284 mflo ($t_1,$a_5,$b_0)
1285 mfhi ($t_2,$a_5,$b_0)
1288 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1293 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1295 mflo ($t_1,$a_6,$b_0)
1296 mfhi ($t_2,$a_6,$b_0)
1299 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1303 mflo ($t_1,$a_5,$b_1)
1304 mfhi ($t_2,$a_5,$b_1)
1307 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1312 mflo ($t_1,$a_4,$b_2)
1313 mfhi ($t_2,$a_4,$b_2)
1316 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1321 mflo ($t_1,$a_3,$b_3)
1322 mfhi ($t_2,$a_3,$b_3)
1325 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1330 mflo ($t_1,$a_2,$b_4)
1331 mfhi ($t_2,$a_2,$b_4)
1334 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1339 mflo ($t_1,$a_1,$b_5)
1340 mfhi ($t_2,$a_1,$b_5)
1343 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1348 mflo ($t_1,$a_0,$b_6)
1349 mfhi ($t_2,$a_0,$b_6)
1352 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1357 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1359 mflo ($t_1,$a_0,$b_7)
1360 mfhi ($t_2,$a_0,$b_7)
1363 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1367 mflo ($t_1,$a_1,$b_6)
1368 mfhi ($t_2,$a_1,$b_6)
1371 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1376 mflo ($t_1,$a_2,$b_5)
1377 mfhi ($t_2,$a_2,$b_5)
1380 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1385 mflo ($t_1,$a_3,$b_4)
1386 mfhi ($t_2,$a_3,$b_4)
1389 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1394 mflo ($t_1,$a_4,$b_3)
1395 mfhi ($t_2,$a_4,$b_3)
1398 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1403 mflo ($t_1,$a_5,$b_2)
1404 mfhi ($t_2,$a_5,$b_2)
1407 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1412 mflo ($t_1,$a_6,$b_1)
1413 mfhi ($t_2,$a_6,$b_1)
1416 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1421 mflo ($t_1,$a_7,$b_0)
1422 mfhi ($t_2,$a_7,$b_0)
1425 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1430 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1432 mflo ($t_1,$a_7,$b_1)
1433 mfhi ($t_2,$a_7,$b_1)
1436 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1440 mflo ($t_1,$a_6,$b_2)
1441 mfhi ($t_2,$a_6,$b_2)
1444 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1449 mflo ($t_1,$a_5,$b_3)
1450 mfhi ($t_2,$a_5,$b_3)
1453 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1458 mflo ($t_1,$a_4,$b_4)
1459 mfhi ($t_2,$a_4,$b_4)
1462 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1467 mflo ($t_1,$a_3,$b_5)
1468 mfhi ($t_2,$a_3,$b_5)
1471 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1476 mflo ($t_1,$a_2,$b_6)
1477 mfhi ($t_2,$a_2,$b_6)
1480 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1485 mflo ($t_1,$a_1,$b_7)
1486 mfhi ($t_2,$a_1,$b_7)
1489 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1494 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1496 mflo ($t_1,$a_2,$b_7)
1497 mfhi ($t_2,$a_2,$b_7)
1500 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1504 mflo ($t_1,$a_3,$b_6)
1505 mfhi ($t_2,$a_3,$b_6)
1508 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1513 mflo ($t_1,$a_4,$b_5)
1514 mfhi ($t_2,$a_4,$b_5)
1517 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1522 mflo ($t_1,$a_5,$b_4)
1523 mfhi ($t_2,$a_5,$b_4)
1526 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1531 mflo ($t_1,$a_6,$b_3)
1532 mfhi ($t_2,$a_6,$b_3)
1535 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1540 mflo ($t_1,$a_7,$b_2)
1541 mfhi ($t_2,$a_7,$b_2)
1544 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1549 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1551 mflo ($t_1,$a_7,$b_3)
1552 mfhi ($t_2,$a_7,$b_3)
1555 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1559 mflo ($t_1,$a_6,$b_4)
1560 mfhi ($t_2,$a_6,$b_4)
1563 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1568 mflo ($t_1,$a_5,$b_5)
1569 mfhi ($t_2,$a_5,$b_5)
1572 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1577 mflo ($t_1,$a_4,$b_6)
1578 mfhi ($t_2,$a_4,$b_6)
1581 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1586 mflo ($t_1,$a_3,$b_7)
1587 mfhi ($t_2,$a_3,$b_7)
1590 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1595 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1597 mflo ($t_1,$a_4,$b_7)
1598 mfhi ($t_2,$a_4,$b_7)
1601 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1605 mflo ($t_1,$a_5,$b_6)
1606 mfhi ($t_2,$a_5,$b_6)
1609 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1614 mflo ($t_1,$a_6,$b_5)
1615 mfhi ($t_2,$a_6,$b_5)
1618 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1623 mflo ($t_1,$a_7,$b_4)
1624 mfhi ($t_2,$a_7,$b_4)
1627 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1632 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1634 mflo ($t_1,$a_7,$b_5)
1635 mfhi ($t_2,$a_7,$b_5)
1638 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1642 mflo ($t_1,$a_6,$b_6)
1643 mfhi ($t_2,$a_6,$b_6)
1646 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1651 mflo ($t_1,$a_5,$b_7)
1652 mfhi ($t_2,$a_5,$b_7)
1655 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1660 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1662 mflo ($t_1,$a_6,$b_7)
1663 mfhi ($t_2,$a_6,$b_7)
1666 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1670 mflo ($t_1,$a_7,$b_6)
1671 mfhi ($t_2,$a_7,$b_6)
1674 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1679 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1681 mflo ($t_1,$a_7,$b_7)
1682 mfhi ($t_2,$a_7,$b_7)
1687 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1688 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1692 $code.=<<___ if ($flavour =~ /nubi/i);
1693 $REG_L $s5,10*$SZREG($sp)
1694 $REG_L $s4,9*$SZREG($sp)
1695 $REG_L $s3,8*$SZREG($sp)
1696 $REG_L $s2,7*$SZREG($sp)
1697 $REG_L $s1,6*$SZREG($sp)
1698 $REG_L $s0,5*$SZREG($sp)
1699 $REG_L $t3,4*$SZREG($sp)
1700 $REG_L $t2,3*$SZREG($sp)
1701 $REG_L $t1,2*$SZREG($sp)
1702 $REG_L $t0,1*$SZREG($sp)
1703 $REG_L $gp,0*$SZREG($sp)
1705 $PTR_ADD $sp,12*$SZREG
1707 $code.=<<___ if ($flavour !~ /nubi/i);
1708 $REG_L $s5,5*$SZREG($sp)
1709 $REG_L $s4,4*$SZREG($sp)
1710 $REG_L $s3,3*$SZREG($sp)
1711 $REG_L $s2,2*$SZREG($sp)
1712 $REG_L $s1,1*$SZREG($sp)
1713 $REG_L $s0,0*$SZREG($sp)
1715 $PTR_ADD $sp,6*$SZREG
1721 .globl bn_mul_comba4
1725 $code.=<<___ if ($flavour =~ /nubi/i);
1726 .frame $sp,6*$SZREG,$ra
1727 .mask 0x8000f008,-$SZREG
1729 $PTR_SUB $sp,6*$SZREG
1730 $REG_S $ra,5*$SZREG($sp)
1731 $REG_S $t3,4*$SZREG($sp)
1732 $REG_S $t2,3*$SZREG($sp)
1733 $REG_S $t1,2*$SZREG($sp)
1734 $REG_S $t0,1*$SZREG($sp)
1735 $REG_S $gp,0*$SZREG($sp)
1742 $LD $a_2,2*$BNSZ($a1)
1743 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1744 $LD $a_3,3*$BNSZ($a1)
1746 $LD $b_2,2*$BNSZ($a2)
1747 $LD $b_3,3*$BNSZ($a2)
1748 mflo ($c_1,$a_0,$b_0)
1749 mfhi ($c_2,$a_0,$b_0)
1752 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1753 mflo ($t_1,$a_0,$b_1)
1754 mfhi ($t_2,$a_0,$b_1)
1757 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1759 mflo ($t_1,$a_1,$b_0)
1760 mfhi ($t_2,$a_1,$b_0)
1763 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1769 mflo ($t_1,$a_2,$b_0)
1770 mfhi ($t_2,$a_2,$b_0)
1773 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1776 mflo ($t_1,$a_1,$b_1)
1777 mfhi ($t_2,$a_1,$b_1)
1780 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1784 mflo ($t_1,$a_0,$b_2)
1785 mfhi ($t_2,$a_0,$b_2)
1788 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1793 $ST $c_3,2*$BNSZ($a0)
1795 mflo ($t_1,$a_0,$b_3)
1796 mfhi ($t_2,$a_0,$b_3)
1799 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1803 mflo ($t_1,$a_1,$b_2)
1804 mfhi ($t_2,$a_1,$b_2)
1807 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1812 mflo ($t_1,$a_2,$b_1)
1813 mfhi ($t_2,$a_2,$b_1)
1816 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1821 mflo ($t_1,$a_3,$b_0)
1822 mfhi ($t_2,$a_3,$b_0)
1825 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1830 $ST $c_1,3*$BNSZ($a0)
1832 mflo ($t_1,$a_3,$b_1)
1833 mfhi ($t_2,$a_3,$b_1)
1836 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1840 mflo ($t_1,$a_2,$b_2)
1841 mfhi ($t_2,$a_2,$b_2)
1844 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1849 mflo ($t_1,$a_1,$b_3)
1850 mfhi ($t_2,$a_1,$b_3)
1853 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1858 $ST $c_2,4*$BNSZ($a0)
1860 mflo ($t_1,$a_2,$b_3)
1861 mfhi ($t_2,$a_2,$b_3)
1864 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1868 mflo ($t_1,$a_3,$b_2)
1869 mfhi ($t_2,$a_3,$b_2)
1872 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1877 $ST $c_3,5*$BNSZ($a0)
1879 mflo ($t_1,$a_3,$b_3)
1880 mfhi ($t_2,$a_3,$b_3)
1885 $ST $c_1,6*$BNSZ($a0)
1886 $ST $c_2,7*$BNSZ($a0)
1890 $code.=<<___ if ($flavour =~ /nubi/i);
1891 $REG_L $t3,4*$SZREG($sp)
1892 $REG_L $t2,3*$SZREG($sp)
1893 $REG_L $t1,2*$SZREG($sp)
1894 $REG_L $t0,1*$SZREG($sp)
1895 $REG_L $gp,0*$SZREG($sp)
1896 $PTR_ADD $sp,6*$SZREG
1904 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); # the squaring code below reads only a[], so a[4..7] take over the registers previously named $b_0..$b_3
1907 my ($hi,$lo,$c0,$c1,$c2,
1908 $warm, # !$warm denotes first call with specific sequence of
1909 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1910 $an,$bn # these two are arguments for multiplication which
1911 # result is used in *next* step [which is why it's
1912 # commented as "forward multiplication" below];
1917 $MULTU ($an,$bn) # forward multiplication
1924 $code.=<<___ if (!$warm);
1928 $code.=<<___ if ($warm);
1944 .globl bn_sqr_comba8
1948 $code.=<<___ if ($flavour =~ /nubi/i);
1949 .frame $sp,6*$SZREG,$ra
1950 .mask 0x8000f008,-$SZREG
1952 $PTR_SUB $sp,6*$SZREG
1953 $REG_S $ra,5*$SZREG($sp)
1954 $REG_S $t3,4*$SZREG($sp)
1955 $REG_S $t2,3*$SZREG($sp)
1956 $REG_S $t1,2*$SZREG($sp)
1957 $REG_S $t0,1*$SZREG($sp)
1958 $REG_S $gp,0*$SZREG($sp)
1964 $LD $a_2,2*$BNSZ($a1)
1965 $LD $a_3,3*$BNSZ($a1)
1967 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1968 $LD $a_4,4*$BNSZ($a1)
1969 $LD $a_5,5*$BNSZ($a1)
1970 $LD $a_6,6*$BNSZ($a1)
1971 $LD $a_7,7*$BNSZ($a1)
1972 mflo ($c_1,$a_0,$a_0)
1973 mfhi ($c_2,$a_0,$a_0)
1976 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1977 mflo ($t_1,$a_0,$a_1)
1978 mfhi ($t_2,$a_0,$a_1)
1981 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
1989 mflo ($t_1,$a_2,$a_0)
1990 mfhi ($t_2,$a_2,$a_0)
1992 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1993 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1997 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2002 $ST $c_3,2*$BNSZ($a0)
2003 mflo ($t_1,$a_0,$a_3)
2004 mfhi ($t_2,$a_0,$a_3)
2006 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2007 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2008 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2009 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2011 $ST $c_1,3*$BNSZ($a0)
2013 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2014 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2015 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2016 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2020 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
2025 $ST $c_2,4*$BNSZ($a0)
2026 mflo ($t_1,$a_0,$a_5)
2027 mfhi ($t_2,$a_0,$a_5)
2029 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2030 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2031 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2032 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2033 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2034 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2036 $ST $c_3,5*$BNSZ($a0)
2038 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2039 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2040 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2041 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2042 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2043 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2047 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
2052 $ST $c_1,6*$BNSZ($a0)
2053 mflo ($t_1,$a_0,$a_7)
2054 mfhi ($t_2,$a_0,$a_7)
2056 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2057 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2058 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2059 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2060 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2061 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2062 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2063 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2065 $ST $c_2,7*$BNSZ($a0)
2067 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2068 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2069 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2070 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2071 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2072 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2076 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
2081 $ST $c_3,8*$BNSZ($a0)
2082 mflo ($t_1,$a_2,$a_7)
2083 mfhi ($t_2,$a_2,$a_7)
2085 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2086 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2087 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2088 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2089 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2090 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2092 $ST $c_1,9*$BNSZ($a0)
2094 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2095 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2096 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2097 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2101 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
2106 $ST $c_2,10*$BNSZ($a0)
2107 mflo ($t_1,$a_4,$a_7)
2108 mfhi ($t_2,$a_4,$a_7)
2110 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2111 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2112 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2113 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2115 $ST $c_3,11*$BNSZ($a0)
2117 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2118 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2122 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
2127 $ST $c_1,12*$BNSZ($a0)
2128 mflo ($t_1,$a_6,$a_7)
2129 mfhi ($t_2,$a_6,$a_7)
2131 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2132 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2134 $ST $c_2,13*$BNSZ($a0)
2140 $ST $c_3,14*$BNSZ($a0)
2141 $ST $c_1,15*$BNSZ($a0)
2145 $code.=<<___ if ($flavour =~ /nubi/i);
2146 $REG_L $t3,4*$SZREG($sp)
2147 $REG_L $t2,3*$SZREG($sp)
2148 $REG_L $t1,2*$SZREG($sp)
2149 $REG_L $t0,1*$SZREG($sp)
2150 $REG_L $gp,0*$SZREG($sp)
2151 $PTR_ADD $sp,6*$SZREG
2159 .globl bn_sqr_comba4
2163 $code.=<<___ if ($flavour =~ /nubi/i);
2164 .frame $sp,6*$SZREG,$ra
2165 .mask 0x8000f008,-$SZREG
2167 $PTR_SUB $sp,6*$SZREG
2168 $REG_S $ra,5*$SZREG($sp)
2169 $REG_S $t3,4*$SZREG($sp)
2170 $REG_S $t2,3*$SZREG($sp)
2171 $REG_S $t1,2*$SZREG($sp)
2172 $REG_S $t0,1*$SZREG($sp)
2173 $REG_S $gp,0*$SZREG($sp)
2179 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
2180 $LD $a_2,2*$BNSZ($a1)
2181 $LD $a_3,3*$BNSZ($a1)
2182 mflo ($c_1,$a_0,$a_0)
2183 mfhi ($c_2,$a_0,$a_0)
2186 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2187 mflo ($t_1,$a_0,$a_1)
2188 mfhi ($t_2,$a_0,$a_1)
2191 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
2199 mflo ($t_1,$a_2,$a_0)
2200 mfhi ($t_2,$a_2,$a_0)
2202 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2203 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2207 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2212 $ST $c_3,2*$BNSZ($a0)
2213 mflo ($t_1,$a_0,$a_3)
2214 mfhi ($t_2,$a_0,$a_3)
2216 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2217 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2218 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2219 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2221 $ST $c_1,3*$BNSZ($a0)
2223 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2224 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2228 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
2233 $ST $c_2,4*$BNSZ($a0)
2234 mflo ($t_1,$a_2,$a_3)
2235 mfhi ($t_2,$a_2,$a_3)
2237 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2238 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2240 $ST $c_3,5*$BNSZ($a0)
2246 $ST $c_1,6*$BNSZ($a0)
2247 $ST $c_2,7*$BNSZ($a0)
2251 $code.=<<___ if ($flavour =~ /nubi/i);
2252 $REG_L $t3,4*$SZREG($sp)
2253 $REG_L $t2,3*$SZREG($sp)
2254 $REG_L $t1,2*$SZREG($sp)
2255 $REG_L $t0,1*$SZREG($sp)
2256 $REG_L $gp,0*$SZREG($sp)
2257 $PTR_ADD $sp,6*$SZREG
2265 close STDOUT or die "error closing STDOUT: $!"; # closing flushes buffered output; die so a failed final write is not mistaken for success