2 # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
14 # Rights for redistribution and usage in source and binary forms are
15 # granted according to the OpenSSL license. Warranty of any kind is
17 # ====================================================================
22 # This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
24 # The module is designed to work with either of the "new" MIPS ABI(5),
25 # namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
26 # IRIX 5.x not only because it doesn't support new ABIs but also
27 # because 5.x kernels put R4x00 CPU into 32-bit mode and all those
28 # 64-bit instructions (daddu, dmultu, etc.) found below would only
29 # cause an illegal instruction exception:-(
31 # In addition the code depends on preprocessor flags set up by MIPSpro
32 # compiler driver (either as or cc) and therefore (probably?) can't be
33 # compiled by the GNU assembler. GNU C driver manages fine though...
34 # I mean as long as -mmips-as is specified or is the default option,
35 # because then it simply invokes /usr/bin/as which in turn takes
36 # perfect care of the preprocessor definitions. Another neat feature
37 # offered by the MIPSpro assembler is an optimization pass. This gave
38 # me the opportunity to have the code looking more regular as all those
39 # architecture dependent instruction rescheduling details were left to
40 # the assembler. Cool, huh?
42 # Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
43 # goes way over 3 times faster!
45 # <appro@fy.chalmers.se>
49 # Adapt the module even for 32-bit ABIs and other OSes. The former was
50 # achieved by mechanical replacement of 64-bit arithmetic instructions
51 # such as dmultu, daddu, etc. with their 32-bit counterparts and
52 # adjusting offsets denoting multiples of BN_ULONG. Above mentioned
53 # >3x performance improvement naturally does not apply to 32-bit code
54 # [because there is no instruction 32-bit compiler can't use], one
55 # has to be content with a 40-85% improvement depending on benchmark and
56 # key length, more for longer keys.
58 $flavour = shift || "o32";
59 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
60 open STDOUT,">$output";
62 if ($flavour =~ /64|n32/i) {
95 # Below is N32/64 register layout used in the original module.
97 ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
98 ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
99 ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
100 ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
101 ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
102 ($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
104 # No special adaptation is required for O32. NUBI on the other hand
105 # is treated by saving/restoring ($v1,$t0..$t3).
107 $gp=$v1 if ($flavour =~ /nubi/i);
112 #include "mips_arch.h"
114 #if defined(_MIPS_ARCH_MIPS64R6)
115 # define ddivu(rs,rt)
116 # define mfqt(rd,rs,rt) ddivu rd,rs,rt
117 # define mfrm(rd,rs,rt) dmodu rd,rs,rt
118 #elif defined(_MIPS_ARCH_MIPS32R6)
120 # define mfqt(rd,rs,rt) divu rd,rs,rt
121 # define mfrm(rd,rs,rt) modu rd,rs,rt
123 # define $DIVU(rs,rt) $DIVU $zero,rs,rt
124 # define mfqt(rd,rs,rt) mflo rd
125 # define mfrm(rd,rs,rt) mfhi rd
129 .asciiz "mips3.s, Version 1.2"
130 .asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
136 .globl bn_mul_add_words
137 .ent bn_mul_add_words
140 bgtz $a2,bn_mul_add_words_internal
144 .end bn_mul_add_words
147 .ent bn_mul_add_words_internal
148 bn_mul_add_words_internal:
150 $code.=<<___ if ($flavour =~ /nubi/i);
151 .frame $sp,6*$SZREG,$ra
152 .mask 0x8000f008,-$SZREG
154 $PTR_SUB $sp,6*$SZREG
155 $REG_S $ra,5*$SZREG($sp)
156 $REG_S $t3,4*$SZREG($sp)
157 $REG_S $t2,3*$SZREG($sp)
158 $REG_S $t1,2*$SZREG($sp)
159 $REG_S $t0,1*$SZREG($sp)
160 $REG_S $gp,0*$SZREG($sp)
166 beqz $ta0,.L_bn_mul_add_words_tail
168 .L_bn_mul_add_words_loop:
174 $LD $ta0,2*$BNSZ($a1)
175 $LD $ta1,2*$BNSZ($a0)
177 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
178 # values", but it seems to work fine
179 # even on 64-bit registers.
189 $LD $ta2,3*$BNSZ($a1)
190 $LD $ta3,3*$BNSZ($a0)
213 $ST $ta1,-2*$BNSZ($a0)
227 bgtz $ta0,.L_bn_mul_add_words_loop
230 beqz $a2,.L_bn_mul_add_words_return
233 .L_bn_mul_add_words_tail:
248 beqz $a2,.L_bn_mul_add_words_return
263 beqz $a2,.L_bn_mul_add_words_return
278 .L_bn_mul_add_words_return:
281 $code.=<<___ if ($flavour =~ /nubi/i);
282 $REG_L $t3,4*$SZREG($sp)
283 $REG_L $t2,3*$SZREG($sp)
284 $REG_L $t1,2*$SZREG($sp)
285 $REG_L $t0,1*$SZREG($sp)
286 $REG_L $gp,0*$SZREG($sp)
287 $PTR_ADD $sp,6*$SZREG
292 .end bn_mul_add_words_internal
299 bgtz $a2,bn_mul_words_internal
306 .ent bn_mul_words_internal
307 bn_mul_words_internal:
309 $code.=<<___ if ($flavour =~ /nubi/i);
310 .frame $sp,6*$SZREG,$ra
311 .mask 0x8000f008,-$SZREG
313 $PTR_SUB $sp,6*$SZREG
314 $REG_S $ra,5*$SZREG($sp)
315 $REG_S $t3,4*$SZREG($sp)
316 $REG_S $t2,3*$SZREG($sp)
317 $REG_S $t1,2*$SZREG($sp)
318 $REG_S $t0,1*$SZREG($sp)
319 $REG_S $gp,0*$SZREG($sp)
325 beqz $ta0,.L_bn_mul_words_tail
327 .L_bn_mul_words_loop:
331 $LD $ta0,2*$BNSZ($a1)
332 $LD $ta2,3*$BNSZ($a1)
349 $ST $v0,-3*$BNSZ($a0)
357 $ST $v0,-2*$BNSZ($a0)
367 bgtz $ta0,.L_bn_mul_words_loop
370 beqz $a2,.L_bn_mul_words_return
373 .L_bn_mul_words_tail:
384 beqz $a2,.L_bn_mul_words_return
395 beqz $a2,.L_bn_mul_words_return
406 .L_bn_mul_words_return:
409 $code.=<<___ if ($flavour =~ /nubi/i);
410 $REG_L $t3,4*$SZREG($sp)
411 $REG_L $t2,3*$SZREG($sp)
412 $REG_L $t1,2*$SZREG($sp)
413 $REG_L $t0,1*$SZREG($sp)
414 $REG_L $gp,0*$SZREG($sp)
415 $PTR_ADD $sp,6*$SZREG
420 .end bn_mul_words_internal
427 bgtz $a2,bn_sqr_words_internal
434 .ent bn_sqr_words_internal
435 bn_sqr_words_internal:
437 $code.=<<___ if ($flavour =~ /nubi/i);
438 .frame $sp,6*$SZREG,$ra
439 .mask 0x8000f008,-$SZREG
441 $PTR_SUB $sp,6*$SZREG
442 $REG_S $ra,5*$SZREG($sp)
443 $REG_S $t3,4*$SZREG($sp)
444 $REG_S $t2,3*$SZREG($sp)
445 $REG_S $t1,2*$SZREG($sp)
446 $REG_S $t0,1*$SZREG($sp)
447 $REG_S $gp,0*$SZREG($sp)
453 beqz $ta0,.L_bn_sqr_words_tail
455 .L_bn_sqr_words_loop:
459 $LD $ta0,2*$BNSZ($a1)
460 $LD $ta2,3*$BNSZ($a1)
472 $ST $t3,-6*$BNSZ($a0)
473 $ST $t2,-5*$BNSZ($a0)
476 mflo ($ta1,$ta0,$ta0)
477 mfhi ($ta0,$ta0,$ta0)
478 $ST $ta1,-4*$BNSZ($a0)
479 $ST $ta0,-3*$BNSZ($a0)
484 mflo ($ta3,$ta2,$ta2)
485 mfhi ($ta2,$ta2,$ta2)
486 $ST $ta3,-2*$BNSZ($a0)
489 bgtz $ta0,.L_bn_sqr_words_loop
492 beqz $a2,.L_bn_sqr_words_return
495 .L_bn_sqr_words_tail:
504 beqz $a2,.L_bn_sqr_words_return
513 beqz $a2,.L_bn_sqr_words_return
522 .L_bn_sqr_words_return:
525 $code.=<<___ if ($flavour =~ /nubi/i);
526 $REG_L $t3,4*$SZREG($sp)
527 $REG_L $t2,3*$SZREG($sp)
528 $REG_L $t1,2*$SZREG($sp)
529 $REG_L $t0,1*$SZREG($sp)
530 $REG_L $gp,0*$SZREG($sp)
531 $PTR_ADD $sp,6*$SZREG
537 .end bn_sqr_words_internal
544 bgtz $a3,bn_add_words_internal
551 .ent bn_add_words_internal
552 bn_add_words_internal:
554 $code.=<<___ if ($flavour =~ /nubi/i);
555 .frame $sp,6*$SZREG,$ra
556 .mask 0x8000f008,-$SZREG
558 $PTR_SUB $sp,6*$SZREG
559 $REG_S $ra,5*$SZREG($sp)
560 $REG_S $t3,4*$SZREG($sp)
561 $REG_S $t2,3*$SZREG($sp)
562 $REG_S $t1,2*$SZREG($sp)
563 $REG_S $t0,1*$SZREG($sp)
564 $REG_S $gp,0*$SZREG($sp)
570 beqz $at,.L_bn_add_words_tail
572 .L_bn_add_words_loop:
582 $LD $ta1,-3*$BNSZ($a2)
584 $LD $ta2,-2*$BNSZ($a2)
590 $ST $t0,-4*$BNSZ($a0)
597 $ST $t1,-3*$BNSZ($a0)
604 $ST $t2,-2*$BNSZ($a0)
614 bgtz $at,.L_bn_add_words_loop
617 beqz $a3,.L_bn_add_words_return
620 .L_bn_add_words_tail:
631 beqz $a3,.L_bn_add_words_return
642 beqz $a3,.L_bn_add_words_return
645 $LD $ta2,2*$BNSZ($a2)
653 .L_bn_add_words_return:
656 $code.=<<___ if ($flavour =~ /nubi/i);
657 $REG_L $t3,4*$SZREG($sp)
658 $REG_L $t2,3*$SZREG($sp)
659 $REG_L $t1,2*$SZREG($sp)
660 $REG_L $t0,1*$SZREG($sp)
661 $REG_L $gp,0*$SZREG($sp)
662 $PTR_ADD $sp,6*$SZREG
668 .end bn_add_words_internal
675 bgtz $a3,bn_sub_words_internal
682 .ent bn_sub_words_internal
683 bn_sub_words_internal:
685 $code.=<<___ if ($flavour =~ /nubi/i);
686 .frame $sp,6*$SZREG,$ra
687 .mask 0x8000f008,-$SZREG
689 $PTR_SUB $sp,6*$SZREG
690 $REG_S $ra,5*$SZREG($sp)
691 $REG_S $t3,4*$SZREG($sp)
692 $REG_S $t2,3*$SZREG($sp)
693 $REG_S $t1,2*$SZREG($sp)
694 $REG_S $t0,1*$SZREG($sp)
695 $REG_S $gp,0*$SZREG($sp)
701 beqz $at,.L_bn_sub_words_tail
703 .L_bn_sub_words_loop:
713 $LD $ta1,-3*$BNSZ($a2)
715 $LD $ta2,-2*$BNSZ($a2)
721 $ST $t0,-4*$BNSZ($a0)
728 $ST $t1,-3*$BNSZ($a0)
736 $ST $t2,-2*$BNSZ($a0)
746 bgtz $at,.L_bn_sub_words_loop
749 beqz $a3,.L_bn_sub_words_return
752 .L_bn_sub_words_tail:
763 beqz $a3,.L_bn_sub_words_return
774 beqz $a3,.L_bn_sub_words_return
777 $LD $ta2,2*$BNSZ($a2)
785 .L_bn_sub_words_return:
788 $code.=<<___ if ($flavour =~ /nubi/i);
789 $REG_L $t3,4*$SZREG($sp)
790 $REG_L $t2,3*$SZREG($sp)
791 $REG_L $t1,2*$SZREG($sp)
792 $REG_L $t0,1*$SZREG($sp)
793 $REG_L $gp,0*$SZREG($sp)
794 $PTR_ADD $sp,6*$SZREG
799 .end bn_sub_words_internal
802 .globl bn_div_3_words
806 move $a3,$a0 # we know that bn_div_words does not
807 # touch $a3, $ta2, $ta3 and preserves $a2
808 # so that we can save two arguments
809 # and return address in registers
810 # instead of stack:-)
814 bne $a0,$a2,bn_div_3_words_internal
822 .ent bn_div_3_words_internal
823 bn_div_3_words_internal:
825 $code.=<<___ if ($flavour =~ /nubi/i);
826 .frame $sp,6*$SZREG,$ra
827 .mask 0x8000f008,-$SZREG
829 $PTR_SUB $sp,6*$SZREG
830 $REG_S $ra,5*$SZREG($sp)
831 $REG_S $t3,4*$SZREG($sp)
832 $REG_S $t2,3*$SZREG($sp)
833 $REG_S $t1,2*$SZREG($sp)
834 $REG_S $t0,1*$SZREG($sp)
835 $REG_S $gp,0*$SZREG($sp)
840 bal bn_div_words_internal
843 $LD $t2,-2*$BNSZ($a3)
848 .L_bn_div_3_words_inner_loop:
849 bnez $t8,.L_bn_div_3_words_inner_loop_done
861 beqz $at,.L_bn_div_3_words_inner_loop
865 .L_bn_div_3_words_inner_loop_done:
868 $code.=<<___ if ($flavour =~ /nubi/i);
869 $REG_L $t3,4*$SZREG($sp)
870 $REG_L $t2,3*$SZREG($sp)
871 $REG_L $t1,2*$SZREG($sp)
872 $REG_L $t0,1*$SZREG($sp)
873 $REG_L $gp,0*$SZREG($sp)
874 $PTR_ADD $sp,6*$SZREG
879 .end bn_div_3_words_internal
886 bnez $a2,bn_div_words_internal
887 li $v0,-1 # I would rather signal div-by-zero
888 # which can be done with 'break 7'
894 .ent bn_div_words_internal
895 bn_div_words_internal:
897 $code.=<<___ if ($flavour =~ /nubi/i);
898 .frame $sp,6*$SZREG,$ra
899 .mask 0x8000f008,-$SZREG
901 $PTR_SUB $sp,6*$SZREG
902 $REG_S $ra,5*$SZREG($sp)
903 $REG_S $t3,4*$SZREG($sp)
904 $REG_S $t2,3*$SZREG($sp)
905 $REG_S $t1,2*$SZREG($sp)
906 $REG_S $t0,1*$SZREG($sp)
907 $REG_S $gp,0*$SZREG($sp)
911 bltz $a2,.L_bn_div_words_body
926 break 6 # signal overflow
936 .L_bn_div_words_body:
937 $SRL $DH,$a2,4*$BNSZ # bits
946 $SRL $HH,$a0,4*$BNSZ # bits
947 $SRL $QT,4*$BNSZ # q=0xffffffff
948 beq $DH,$HH,.L_bn_div_words_skip_div1
951 .L_bn_div_words_skip_div1:
953 $SLL $t3,$a0,4*$BNSZ # bits
954 $SRL $at,$a1,4*$BNSZ # bits
958 .L_bn_div_words_inner_loop1:
966 beqz $at,.L_bn_div_words_inner_loop1_done
969 b .L_bn_div_words_inner_loop1
972 .L_bn_div_words_inner_loop1_done:
974 $SLL $a1,4*$BNSZ # bits
976 $SLL $v0,$QT,4*$BNSZ # bits
979 $SRL $HH,$a0,4*$BNSZ # bits
980 $SRL $QT,4*$BNSZ # q=0xffffffff
981 beq $DH,$HH,.L_bn_div_words_skip_div2
984 .L_bn_div_words_skip_div2:
986 $SLL $t3,$a0,4*$BNSZ # bits
987 $SRL $at,$a1,4*$BNSZ # bits
991 .L_bn_div_words_inner_loop2:
999 beqz $at,.L_bn_div_words_inner_loop2_done
1002 b .L_bn_div_words_inner_loop2
1005 .L_bn_div_words_inner_loop2_done:
1009 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
1010 $SRL $a2,$t9 # restore $a2
1015 $code.=<<___ if ($flavour =~ /nubi/i);
1016 $REG_L $t3,4*$SZREG($sp)
1017 $REG_L $t2,3*$SZREG($sp)
1018 $REG_L $t1,2*$SZREG($sp)
1019 $REG_L $t0,1*$SZREG($sp)
1020 $REG_L $gp,0*$SZREG($sp)
1021 $PTR_ADD $sp,6*$SZREG
1026 .end bn_div_words_internal
1028 undef $HH; undef $QT; undef $DH;
1030 ($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
1031 ($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
1033 ($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
1034 ($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
1036 ($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
1041 .globl bn_mul_comba8
1046 $code.=<<___ if ($flavour =~ /nubi/i);
1047 .frame $sp,12*$SZREG,$ra
1048 .mask 0x803ff008,-$SZREG
1049 $PTR_SUB $sp,12*$SZREG
1050 $REG_S $ra,11*$SZREG($sp)
1051 $REG_S $s5,10*$SZREG($sp)
1052 $REG_S $s4,9*$SZREG($sp)
1053 $REG_S $s3,8*$SZREG($sp)
1054 $REG_S $s2,7*$SZREG($sp)
1055 $REG_S $s1,6*$SZREG($sp)
1056 $REG_S $s0,5*$SZREG($sp)
1057 $REG_S $t3,4*$SZREG($sp)
1058 $REG_S $t2,3*$SZREG($sp)
1059 $REG_S $t1,2*$SZREG($sp)
1060 $REG_S $t0,1*$SZREG($sp)
1061 $REG_S $gp,0*$SZREG($sp)
1063 $code.=<<___ if ($flavour !~ /nubi/i);
1064 .frame $sp,6*$SZREG,$ra
1065 .mask 0x003f0000,-$SZREG
1066 $PTR_SUB $sp,6*$SZREG
1067 $REG_S $s5,5*$SZREG($sp)
1068 $REG_S $s4,4*$SZREG($sp)
1069 $REG_S $s3,3*$SZREG($sp)
1070 $REG_S $s2,2*$SZREG($sp)
1071 $REG_S $s1,1*$SZREG($sp)
1072 $REG_S $s0,0*$SZREG($sp)
1077 $LD $a_0,0($a1) # If compiled with -mips3 option on
1078 # R5000 box assembler barks on this
1079 # line with "should not have mult/div
1080 # as last instruction in bb (R10K
1081 # bug)" warning. If anybody out there
1082 # has a clue about how to circumvent
1083 # this do send me a note.
1084 # <appro\@fy.chalmers.se>
1088 $LD $a_2,2*$BNSZ($a1)
1089 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1090 $LD $a_3,3*$BNSZ($a1)
1092 $LD $b_2,2*$BNSZ($a2)
1093 $LD $b_3,3*$BNSZ($a2)
1094 mflo ($c_1,$a_0,$b_0)
1095 mfhi ($c_2,$a_0,$b_0)
1097 $LD $a_4,4*$BNSZ($a1)
1098 $LD $a_5,5*$BNSZ($a1)
1099 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1100 $LD $a_6,6*$BNSZ($a1)
1101 $LD $a_7,7*$BNSZ($a1)
1102 $LD $b_4,4*$BNSZ($a2)
1103 $LD $b_5,5*$BNSZ($a2)
1104 mflo ($t_1,$a_0,$b_1)
1105 mfhi ($t_2,$a_0,$b_1)
1108 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1110 $LD $b_6,6*$BNSZ($a2)
1111 $LD $b_7,7*$BNSZ($a2)
1112 $ST $c_1,0($a0) # r[0]=c1;
1113 mflo ($t_1,$a_1,$b_0)
1114 mfhi ($t_2,$a_1,$b_0)
1117 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1121 $ST $c_2,$BNSZ($a0) # r[1]=c2;
1123 mflo ($t_1,$a_2,$b_0)
1124 mfhi ($t_2,$a_2,$b_0)
1127 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1130 mflo ($t_1,$a_1,$b_1)
1131 mfhi ($t_2,$a_1,$b_1)
1134 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1138 mflo ($t_1,$a_0,$b_2)
1139 mfhi ($t_2,$a_0,$b_2)
1142 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1147 $ST $c_3,2*$BNSZ($a0) # r[2]=c3;
1149 mflo ($t_1,$a_0,$b_3)
1150 mfhi ($t_2,$a_0,$b_3)
1153 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1157 mflo ($t_1,$a_1,$b_2)
1158 mfhi ($t_2,$a_1,$b_2)
1161 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1166 mflo ($t_1,$a_2,$b_1)
1167 mfhi ($t_2,$a_2,$b_1)
1170 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1175 mflo ($t_1,$a_3,$b_0)
1176 mfhi ($t_2,$a_3,$b_0)
1179 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
1184 $ST $c_1,3*$BNSZ($a0) # r[3]=c1;
1186 mflo ($t_1,$a_4,$b_0)
1187 mfhi ($t_2,$a_4,$b_0)
1190 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1194 mflo ($t_1,$a_3,$b_1)
1195 mfhi ($t_2,$a_3,$b_1)
1198 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1203 mflo ($t_1,$a_2,$b_2)
1204 mfhi ($t_2,$a_2,$b_2)
1207 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1212 mflo ($t_1,$a_1,$b_3)
1213 mfhi ($t_2,$a_1,$b_3)
1216 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
1221 mflo ($t_1,$a_0,$b_4)
1222 mfhi ($t_2,$a_0,$b_4)
1225 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
1230 $ST $c_2,4*$BNSZ($a0) # r[4]=c2;
1232 mflo ($t_1,$a_0,$b_5)
1233 mfhi ($t_2,$a_0,$b_5)
1236 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
1240 mflo ($t_1,$a_1,$b_4)
1241 mfhi ($t_2,$a_1,$b_4)
1244 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1249 mflo ($t_1,$a_2,$b_3)
1250 mfhi ($t_2,$a_2,$b_3)
1253 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1258 mflo ($t_1,$a_3,$b_2)
1259 mfhi ($t_2,$a_3,$b_2)
1262 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
1267 mflo ($t_1,$a_4,$b_1)
1268 mfhi ($t_2,$a_4,$b_1)
1271 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
1276 mflo ($t_1,$a_5,$b_0)
1277 mfhi ($t_2,$a_5,$b_0)
1280 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
1285 $ST $c_3,5*$BNSZ($a0) # r[5]=c3;
1287 mflo ($t_1,$a_6,$b_0)
1288 mfhi ($t_2,$a_6,$b_0)
1291 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
1295 mflo ($t_1,$a_5,$b_1)
1296 mfhi ($t_2,$a_5,$b_1)
1299 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
1304 mflo ($t_1,$a_4,$b_2)
1305 mfhi ($t_2,$a_4,$b_2)
1308 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1313 mflo ($t_1,$a_3,$b_3)
1314 mfhi ($t_2,$a_3,$b_3)
1317 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
1322 mflo ($t_1,$a_2,$b_4)
1323 mfhi ($t_2,$a_2,$b_4)
1326 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
1331 mflo ($t_1,$a_1,$b_5)
1332 mfhi ($t_2,$a_1,$b_5)
1335 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
1340 mflo ($t_1,$a_0,$b_6)
1341 mfhi ($t_2,$a_0,$b_6)
1344 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
1349 $ST $c_1,6*$BNSZ($a0) # r[6]=c1;
1351 mflo ($t_1,$a_0,$b_7)
1352 mfhi ($t_2,$a_0,$b_7)
1355 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
1359 mflo ($t_1,$a_1,$b_6)
1360 mfhi ($t_2,$a_1,$b_6)
1363 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
1368 mflo ($t_1,$a_2,$b_5)
1369 mfhi ($t_2,$a_2,$b_5)
1372 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
1377 mflo ($t_1,$a_3,$b_4)
1378 mfhi ($t_2,$a_3,$b_4)
1381 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
1386 mflo ($t_1,$a_4,$b_3)
1387 mfhi ($t_2,$a_4,$b_3)
1390 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
1395 mflo ($t_1,$a_5,$b_2)
1396 mfhi ($t_2,$a_5,$b_2)
1399 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
1404 mflo ($t_1,$a_6,$b_1)
1405 mfhi ($t_2,$a_6,$b_1)
1408 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
1413 mflo ($t_1,$a_7,$b_0)
1414 mfhi ($t_2,$a_7,$b_0)
1417 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
1422 $ST $c_2,7*$BNSZ($a0) # r[7]=c2;
1424 mflo ($t_1,$a_7,$b_1)
1425 mfhi ($t_2,$a_7,$b_1)
1428 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
1432 mflo ($t_1,$a_6,$b_2)
1433 mfhi ($t_2,$a_6,$b_2)
1436 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
1441 mflo ($t_1,$a_5,$b_3)
1442 mfhi ($t_2,$a_5,$b_3)
1445 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
1450 mflo ($t_1,$a_4,$b_4)
1451 mfhi ($t_2,$a_4,$b_4)
1454 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
1459 mflo ($t_1,$a_3,$b_5)
1460 mfhi ($t_2,$a_3,$b_5)
1463 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
1468 mflo ($t_1,$a_2,$b_6)
1469 mfhi ($t_2,$a_2,$b_6)
1472 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
1477 mflo ($t_1,$a_1,$b_7)
1478 mfhi ($t_2,$a_1,$b_7)
1481 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
1486 $ST $c_3,8*$BNSZ($a0) # r[8]=c3;
1488 mflo ($t_1,$a_2,$b_7)
1489 mfhi ($t_2,$a_2,$b_7)
1492 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
1496 mflo ($t_1,$a_3,$b_6)
1497 mfhi ($t_2,$a_3,$b_6)
1500 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
1505 mflo ($t_1,$a_4,$b_5)
1506 mfhi ($t_2,$a_4,$b_5)
1509 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
1514 mflo ($t_1,$a_5,$b_4)
1515 mfhi ($t_2,$a_5,$b_4)
1518 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
1523 mflo ($t_1,$a_6,$b_3)
1524 mfhi ($t_2,$a_6,$b_3)
1527 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
1532 mflo ($t_1,$a_7,$b_2)
1533 mfhi ($t_2,$a_7,$b_2)
1536 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
1541 $ST $c_1,9*$BNSZ($a0) # r[9]=c1;
1543 mflo ($t_1,$a_7,$b_3)
1544 mfhi ($t_2,$a_7,$b_3)
1547 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
1551 mflo ($t_1,$a_6,$b_4)
1552 mfhi ($t_2,$a_6,$b_4)
1555 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
1560 mflo ($t_1,$a_5,$b_5)
1561 mfhi ($t_2,$a_5,$b_5)
1564 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
1569 mflo ($t_1,$a_4,$b_6)
1570 mfhi ($t_2,$a_4,$b_6)
1573 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
1578 mflo ($t_1,$a_3,$b_7)
1579 mfhi ($t_2,$a_3,$b_7)
1582 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
1587 $ST $c_2,10*$BNSZ($a0) # r[10]=c2;
1589 mflo ($t_1,$a_4,$b_7)
1590 mfhi ($t_2,$a_4,$b_7)
1593 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
1597 mflo ($t_1,$a_5,$b_6)
1598 mfhi ($t_2,$a_5,$b_6)
1601 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
1606 mflo ($t_1,$a_6,$b_5)
1607 mfhi ($t_2,$a_6,$b_5)
1610 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
1615 mflo ($t_1,$a_7,$b_4)
1616 mfhi ($t_2,$a_7,$b_4)
1619 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
1624 $ST $c_3,11*$BNSZ($a0) # r[11]=c3;
1626 mflo ($t_1,$a_7,$b_5)
1627 mfhi ($t_2,$a_7,$b_5)
1630 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
1634 mflo ($t_1,$a_6,$b_6)
1635 mfhi ($t_2,$a_6,$b_6)
1638 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
1643 mflo ($t_1,$a_5,$b_7)
1644 mfhi ($t_2,$a_5,$b_7)
1647 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
1652 $ST $c_1,12*$BNSZ($a0) # r[12]=c1;
1654 mflo ($t_1,$a_6,$b_7)
1655 mfhi ($t_2,$a_6,$b_7)
1658 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
1662 mflo ($t_1,$a_7,$b_6)
1663 mfhi ($t_2,$a_7,$b_6)
1666 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
1671 $ST $c_2,13*$BNSZ($a0) # r[13]=c2;
1673 mflo ($t_1,$a_7,$b_7)
1674 mfhi ($t_2,$a_7,$b_7)
1679 $ST $c_3,14*$BNSZ($a0) # r[14]=c3;
1680 $ST $c_1,15*$BNSZ($a0) # r[15]=c1;
1684 $code.=<<___ if ($flavour =~ /nubi/i);
1685 $REG_L $s5,10*$SZREG($sp)
1686 $REG_L $s4,9*$SZREG($sp)
1687 $REG_L $s3,8*$SZREG($sp)
1688 $REG_L $s2,7*$SZREG($sp)
1689 $REG_L $s1,6*$SZREG($sp)
1690 $REG_L $s0,5*$SZREG($sp)
1691 $REG_L $t3,4*$SZREG($sp)
1692 $REG_L $t2,3*$SZREG($sp)
1693 $REG_L $t1,2*$SZREG($sp)
1694 $REG_L $t0,1*$SZREG($sp)
1695 $REG_L $gp,0*$SZREG($sp)
1697 $PTR_ADD $sp,12*$SZREG
1699 $code.=<<___ if ($flavour !~ /nubi/i);
1700 $REG_L $s5,5*$SZREG($sp)
1701 $REG_L $s4,4*$SZREG($sp)
1702 $REG_L $s3,3*$SZREG($sp)
1703 $REG_L $s2,2*$SZREG($sp)
1704 $REG_L $s1,1*$SZREG($sp)
1705 $REG_L $s0,0*$SZREG($sp)
1707 $PTR_ADD $sp,6*$SZREG
1713 .globl bn_mul_comba4
1717 $code.=<<___ if ($flavour =~ /nubi/i);
1718 .frame $sp,6*$SZREG,$ra
1719 .mask 0x8000f008,-$SZREG
1721 $PTR_SUB $sp,6*$SZREG
1722 $REG_S $ra,5*$SZREG($sp)
1723 $REG_S $t3,4*$SZREG($sp)
1724 $REG_S $t2,3*$SZREG($sp)
1725 $REG_S $t1,2*$SZREG($sp)
1726 $REG_S $t0,1*$SZREG($sp)
1727 $REG_S $gp,0*$SZREG($sp)
1734 $LD $a_2,2*$BNSZ($a1)
1735 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1736 $LD $a_3,3*$BNSZ($a1)
1738 $LD $b_2,2*$BNSZ($a2)
1739 $LD $b_3,3*$BNSZ($a2)
1740 mflo ($c_1,$a_0,$b_0)
1741 mfhi ($c_2,$a_0,$b_0)
1744 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
1745 mflo ($t_1,$a_0,$b_1)
1746 mfhi ($t_2,$a_0,$b_1)
1749 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
1751 mflo ($t_1,$a_1,$b_0)
1752 mfhi ($t_2,$a_1,$b_0)
1755 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
1761 mflo ($t_1,$a_2,$b_0)
1762 mfhi ($t_2,$a_2,$b_0)
1765 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
1768 mflo ($t_1,$a_1,$b_1)
1769 mfhi ($t_2,$a_1,$b_1)
1772 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
1776 mflo ($t_1,$a_0,$b_2)
1777 mfhi ($t_2,$a_0,$b_2)
1780 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
1785 $ST $c_3,2*$BNSZ($a0)
1787 mflo ($t_1,$a_0,$b_3)
1788 mfhi ($t_2,$a_0,$b_3)
1791 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
1795 mflo ($t_1,$a_1,$b_2)
1796 mfhi ($t_2,$a_1,$b_2)
1799 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
1804 mflo ($t_1,$a_2,$b_1)
1805 mfhi ($t_2,$a_2,$b_1)
1808 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
1813 mflo ($t_1,$a_3,$b_0)
1814 mfhi ($t_2,$a_3,$b_0)
1817 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
1822 $ST $c_1,3*$BNSZ($a0)
1824 mflo ($t_1,$a_3,$b_1)
1825 mfhi ($t_2,$a_3,$b_1)
1828 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
1832 mflo ($t_1,$a_2,$b_2)
1833 mfhi ($t_2,$a_2,$b_2)
1836 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
1841 mflo ($t_1,$a_1,$b_3)
1842 mfhi ($t_2,$a_1,$b_3)
1845 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
1850 $ST $c_2,4*$BNSZ($a0)
1852 mflo ($t_1,$a_2,$b_3)
1853 mfhi ($t_2,$a_2,$b_3)
1856 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
1860 mflo ($t_1,$a_3,$b_2)
1861 mfhi ($t_2,$a_3,$b_2)
1864 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
1869 $ST $c_3,5*$BNSZ($a0)
1871 mflo ($t_1,$a_3,$b_3)
1872 mfhi ($t_2,$a_3,$b_3)
1877 $ST $c_1,6*$BNSZ($a0)
1878 $ST $c_2,7*$BNSZ($a0)
1882 $code.=<<___ if ($flavour =~ /nubi/i);
1883 $REG_L $t3,4*$SZREG($sp)
1884 $REG_L $t2,3*$SZREG($sp)
1885 $REG_L $t1,2*$SZREG($sp)
1886 $REG_L $t0,1*$SZREG($sp)
1887 $REG_L $gp,0*$SZREG($sp)
1888 $PTR_ADD $sp,6*$SZREG
1896 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1899 my ($hi,$lo,$c0,$c1,$c2,
1900 $warm, # !$warm denotes first call with specific sequence of
1901 # $c_[XYZ] when there is no Z-carry to accumulate yet;
1902 $an,$bn # these two are arguments for multiplication which
1903 # result is used in *next* step [which is why it's
1904 # commented as "forward multiplication" below];
1909 $MULTU ($an,$bn) # forward multiplication
1916 $code.=<<___ if (!$warm);
1920 $code.=<<___ if ($warm);
1936 .globl bn_sqr_comba8
1940 $code.=<<___ if ($flavour =~ /nubi/i);
1941 .frame $sp,6*$SZREG,$ra
1942 .mask 0x8000f008,-$SZREG
1944 $PTR_SUB $sp,6*$SZREG
1945 $REG_S $ra,5*$SZREG($sp)
1946 $REG_S $t3,4*$SZREG($sp)
1947 $REG_S $t2,3*$SZREG($sp)
1948 $REG_S $t1,2*$SZREG($sp)
1949 $REG_S $t0,1*$SZREG($sp)
1950 $REG_S $gp,0*$SZREG($sp)
1956 $LD $a_2,2*$BNSZ($a1)
1957 $LD $a_3,3*$BNSZ($a1)
1959 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
1960 $LD $a_4,4*$BNSZ($a1)
1961 $LD $a_5,5*$BNSZ($a1)
1962 $LD $a_6,6*$BNSZ($a1)
1963 $LD $a_7,7*$BNSZ($a1)
1964 mflo ($c_1,$a_0,$a_0)
1965 mfhi ($c_2,$a_0,$a_0)
1968 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
1969 mflo ($t_1,$a_0,$a_1)
1970 mfhi ($t_2,$a_0,$a_1)
1973 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
1981 mflo ($t_1,$a_2,$a_0)
1982 mfhi ($t_2,$a_2,$a_0)
1984 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
1985 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
1989 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
1994 $ST $c_3,2*$BNSZ($a0)
1995 mflo ($t_1,$a_0,$a_3)
1996 mfhi ($t_2,$a_0,$a_3)
1998 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
1999 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2000 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2001 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
2003 $ST $c_1,3*$BNSZ($a0)
2005 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2006 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2007 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2008 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2012 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
2017 $ST $c_2,4*$BNSZ($a0)
2018 mflo ($t_1,$a_0,$a_5)
2019 mfhi ($t_2,$a_0,$a_5)
2021 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2022 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
2023 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2024 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
2025 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2026 $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
2028 $ST $c_3,5*$BNSZ($a0)
2030 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2031 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
2032 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2033 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
2034 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2035 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2039 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
2044 $ST $c_1,6*$BNSZ($a0)
2045 mflo ($t_1,$a_0,$a_7)
2046 mfhi ($t_2,$a_0,$a_7)
2048 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2049 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
2050 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2051 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
2052 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2053 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
2054 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2055 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
2057 $ST $c_2,7*$BNSZ($a0)
2059 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2060 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
2061 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2062 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
2063 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2064 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
2068 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
2073 $ST $c_3,8*$BNSZ($a0)
2074 mflo ($t_1,$a_2,$a_7)
2075 mfhi ($t_2,$a_2,$a_7)
2077 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2078 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
2079 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2080 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
2081 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2082 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
2084 $ST $c_1,9*$BNSZ($a0)
2086 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2087 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
2088 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
2089 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
2093 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
2098 $ST $c_2,10*$BNSZ($a0)
2099 mflo ($t_1,$a_4,$a_7)
2100 mfhi ($t_2,$a_4,$a_7)
2102 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2103 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
2104 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
2105 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
2107 $ST $c_3,11*$BNSZ($a0)
2109 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2110 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
2114 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
2119 $ST $c_1,12*$BNSZ($a0)
2120 mflo ($t_1,$a_6,$a_7)
2121 mfhi ($t_2,$a_6,$a_7)
2123 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2124 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
2126 $ST $c_2,13*$BNSZ($a0)
2132 $ST $c_3,14*$BNSZ($a0)
2133 $ST $c_1,15*$BNSZ($a0)
2137 $code.=<<___ if ($flavour =~ /nubi/i);
2138 $REG_L $t3,4*$SZREG($sp)
2139 $REG_L $t2,3*$SZREG($sp)
2140 $REG_L $t1,2*$SZREG($sp)
2141 $REG_L $t0,1*$SZREG($sp)
2142 $REG_L $gp,0*$SZREG($sp)
2143 $PTR_ADD $sp,6*$SZREG
2151 .globl bn_sqr_comba4
2155 $code.=<<___ if ($flavour =~ /nubi/i);
2156 .frame $sp,6*$SZREG,$ra
2157 .mask 0x8000f008,-$SZREG
2159 $PTR_SUB $sp,6*$SZREG
2160 $REG_S $ra,5*$SZREG($sp)
2161 $REG_S $t3,4*$SZREG($sp)
2162 $REG_S $t2,3*$SZREG($sp)
2163 $REG_S $t1,2*$SZREG($sp)
2164 $REG_S $t0,1*$SZREG($sp)
2165 $REG_S $gp,0*$SZREG($sp)
2171 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
2172 $LD $a_2,2*$BNSZ($a1)
2173 $LD $a_3,3*$BNSZ($a1)
2174 mflo ($c_1,$a_0,$a_0)
2175 mfhi ($c_2,$a_0,$a_0)
2178 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
2179 mflo ($t_1,$a_0,$a_1)
2180 mfhi ($t_2,$a_0,$a_1)
2183 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
2191 mflo ($t_1,$a_2,$a_0)
2192 mfhi ($t_2,$a_2,$a_0)
2194 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2195 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
2199 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
2204 $ST $c_3,2*$BNSZ($a0)
2205 mflo ($t_1,$a_0,$a_3)
2206 mfhi ($t_2,$a_0,$a_3)
2208 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
2209 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
2210 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
2211 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
2213 $ST $c_1,3*$BNSZ($a0)
2215 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
2216 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
2220 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
2225 $ST $c_2,4*$BNSZ($a0)
2226 mflo ($t_1,$a_2,$a_3)
2227 mfhi ($t_2,$a_2,$a_3)
2229 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
2230 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
2232 $ST $c_3,5*$BNSZ($a0)
2238 $ST $c_1,6*$BNSZ($a0)
2239 $ST $c_2,7*$BNSZ($a0)
2243 $code.=<<___ if ($flavour =~ /nubi/i);
2244 $REG_L $t3,4*$SZREG($sp)
2245 $REG_L $t2,3*$SZREG($sp)
2246 $REG_L $t1,2*$SZREG($sp)
2247 $REG_L $t0,1*$SZREG($sp)
2248 $REG_L $gp,0*$SZREG($sp)
2249 $PTR_ADD $sp,6*$SZREG