MIPS assembler pack update from HEAD.
authorAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:55:24 +0000 (20:55 +0000)
committerAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:55:24 +0000 (20:55 +0000)
crypto/aes/asm/aes-mips.pl [new file with mode: 0644]
crypto/bn/asm/mips-mont.pl [new file with mode: 0644]
crypto/bn/asm/mips.pl [new file with mode: 0644]
crypto/sha/asm/sha1-mips.pl [new file with mode: 0644]
crypto/sha/asm/sha512-mips.pl [new file with mode: 0644]

diff --git a/crypto/aes/asm/aes-mips.pl b/crypto/aes/asm/aes-mips.pl
new file mode 100644 (file)
index 0000000..2ce6def
--- /dev/null
@@ -0,0 +1,1611 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for MIPS
+
+# October 2010
+#
+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
+# faster than gcc-generated code, which is not very impressive. But
+# recall that compressed S-box requires extra processing, namely
+# additional rotations. Rotations are implemented with lwl/lwr pairs,
+# which are normally used for loading unaligned data. Another cool
+# thing about this module is its endian neutrality, which means that
+# it processes data without ever changing byte order...
+
+######################################################################
+# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+# The first command-line argument selects the target ABI flavour; the
+# supported values are o32, n32, 64, nubi32 and nubi64.
+$flavour = shift;
+
+# Width-dependent mnemonics and the register size in bytes. Flavours
+# matching /64|n32/ get the 64-bit forms (the d-prefixed pointer
+# arithmetic incidentally works even on n32); all others get the
+# 32-bit forms.
+($PTR_ADD,$PTR_SUB,$REG_S,$REG_L,$PTR_SLL,$SZREG) =
+       ($flavour =~ /64|n32/i) ? ("dadd","dsub","sd","ld","dsll",8)
+                               : ("add", "sub", "sw","lw","sll", 4);
+
+# Register handed to .cpload/.cpsetup by the public entry points.
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# Target byte order. When a compiler is configured through $ENV{CC},
+# ask its preprocessor: if the MIPSEL token comes back unexpanded, the
+# macro is not predefined and the target is taken to be big-endian.
+# The probe is skipped when CC is unset so that $big_endian remains
+# undefined and the host-order fallback below can take effect; without
+# the guard the variable was always 0 or 1 and the fallback was dead.
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+
+# Redirect STDOUT to the last argument that looks like a file name.
+# The open is deliberately unchecked, as in the original script.
+for (@ARGV) {  $output=$_ if (/^\w[\w\-]*\.\w+$/);     }
+open STDOUT,">$output";
+
+# No compiler to ask: assume the target byte order equals the host's
+# (pack 'N' is big-endian, unpack 'L' is native order).
+if (!defined($big_endian))
+{    $big_endian=(unpack('L',pack('N',1))==1);   }
+
+# Consume arguments up to and including the first file-name-like one
+# and (re)open STDOUT on it.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($MSB,$LSB)=(0,3);  # automatically converted to little-endian
+
+$code.=<<___;  # file-level assembler preamble, emitted once ahead of all routines
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option        pic2
+#endif
+.set   noat
+___
+\f
+{{{
+my $FRAMESIZE=16*$SZREG;       # frame of AES_encrypt/AES_decrypt: 16 register-sized slots
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;     # emitted in .mask; set bits match the saved register numbers ($31,$30,$23-$16; NUBI adds $15-$12 and $3)
+
+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);       # arguments, table pointer and the four state words live in $a0-$a7; $s0-$s3 shadow the file-level names
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);        # table-index temporaries; the right-hand side still sees the file-level $t0-$t2, which are shadowed only from the next line on
+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));     # twelve temporaries mapped onto $12-$23
+my ($key0,$cnt)=($gp,$fp);     # running round-key pointer and loop counter (loaded from 240($key))
+
+# instruction ordering is "stolen" from the output of the MIPSpro
+# assembler invoked with -mips3 -O3 arguments...
+$code.=<<___;
+.align 5
+.ent   _mips_AES_encrypt
+_mips_AES_encrypt:
+       .frame  $sp,0,$ra
+       .set    reorder
+       lw      $t0,0($key)
+       lw      $t1,4($key)
+       lw      $t2,8($key)
+       lw      $t3,12($key)
+       lw      $cnt,240($key)
+       $PTR_ADD $key0,$key,16
+
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+
+       sub     $cnt,1
+       _xtr    $i0,$s1,16-2
+.Loop_enc:
+       _xtr    $i1,$s2,16-2
+       _xtr    $i2,$s3,16-2
+       _xtr    $i3,$s0,16-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t0,3($i0)              # Te1[s1>>16]
+       lwl     $t1,3($i1)              # Te1[s2>>16]
+       lwl     $t2,3($i2)              # Te1[s3>>16]
+       lwl     $t3,3($i3)              # Te1[s0>>16]
+       lwr     $t0,2($i0)              # Te1[s1>>16]
+       lwr     $t1,2($i1)              # Te1[s2>>16]
+       lwr     $t2,2($i2)              # Te1[s3>>16]
+       lwr     $t3,2($i3)              # Te1[s0>>16]
+
+       _xtr    $i0,$s2,8-2
+       _xtr    $i1,$s3,8-2
+       _xtr    $i2,$s0,8-2
+       _xtr    $i3,$s1,8-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t4,2($i0)              # Te2[s2>>8]
+       lwl     $t5,2($i1)              # Te2[s3>>8]
+       lwl     $t6,2($i2)              # Te2[s0>>8]
+       lwl     $t7,2($i3)              # Te2[s1>>8]
+       lwr     $t4,1($i0)              # Te2[s2>>8]
+       lwr     $t5,1($i1)              # Te2[s3>>8]
+       lwr     $t6,1($i2)              # Te2[s0>>8]
+       lwr     $t7,1($i3)              # Te2[s1>>8]
+
+       _xtr    $i0,$s3,0-2
+       _xtr    $i1,$s0,0-2
+       _xtr    $i2,$s1,0-2
+       _xtr    $i3,$s2,0-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t8,1($i0)              # Te3[s3]
+       lwl     $t9,1($i1)              # Te3[s0]
+       lwl     $t10,1($i2)             # Te3[s1]
+       lwl     $t11,1($i3)             # Te3[s2]
+       lwr     $t8,0($i0)              # Te3[s3]
+       lwr     $t9,0($i1)              # Te3[s0]
+       lwr     $t10,0($i2)             # Te3[s1]
+       lwr     $t11,0($i3)             # Te3[s2]
+
+       _xtr    $i0,$s0,24-2
+       _xtr    $i1,$s1,24-2
+       _xtr    $i2,$s2,24-2
+       _xtr    $i3,$s3,24-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+       lw      $t4,0($i0)              # Te0[s0>>24]
+       lw      $t5,0($i1)              # Te0[s1>>24]
+       lw      $t6,0($i2)              # Te0[s2>>24]
+       lw      $t7,0($i3)              # Te0[s3>>24]
+
+       lw      $s0,0($key0)
+       lw      $s1,4($key0)
+       lw      $s2,8($key0)
+       lw      $s3,12($key0)
+
+       xor     $t0,$t8
+       xor     $t1,$t9
+       xor     $t2,$t10
+       xor     $t3,$t11
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       sub     $cnt,1
+       $PTR_ADD $key0,16
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+       .set    noreorder
+       bnez    $cnt,.Loop_enc
+       _xtr    $i0,$s1,16-2
+
+       .set    reorder
+       _xtr    $i1,$s2,16-2
+       _xtr    $i2,$s3,16-2
+       _xtr    $i3,$s0,16-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t0,2($i0)              # Te4[s1>>16]
+       lbu     $t1,2($i1)              # Te4[s2>>16]
+       lbu     $t2,2($i2)              # Te4[s3>>16]
+       lbu     $t3,2($i3)              # Te4[s0>>16]
+
+       _xtr    $i0,$s2,8-2
+       _xtr    $i1,$s3,8-2
+       _xtr    $i2,$s0,8-2
+       _xtr    $i3,$s1,8-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t4,2($i0)              # Te4[s2>>8]
+       lbu     $t5,2($i1)              # Te4[s3>>8]
+       lbu     $t6,2($i2)              # Te4[s0>>8]
+       lbu     $t7,2($i3)              # Te4[s1>>8]
+
+       _xtr    $i0,$s0,24-2
+       _xtr    $i1,$s1,24-2
+       _xtr    $i2,$s2,24-2
+       _xtr    $i3,$s3,24-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t8,2($i0)              # Te4[s0>>24]
+       lbu     $t9,2($i1)              # Te4[s1>>24]
+       lbu     $t10,2($i2)             # Te4[s2>>24]
+       lbu     $t11,2($i3)             # Te4[s3>>24]
+
+       _xtr    $i0,$s3,0-2
+       _xtr    $i1,$s0,0-2
+       _xtr    $i2,$s1,0-2
+       _xtr    $i3,$s2,0-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+
+       _ins    $t0,16
+       _ins    $t1,16
+       _ins    $t2,16
+       _ins    $t3,16
+
+       _ins    $t4,8
+       _ins    $t5,8
+       _ins    $t6,8
+       _ins    $t7,8
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t4,2($i0)              # Te4[s3]
+       lbu     $t5,2($i1)              # Te4[s0]
+       lbu     $t6,2($i2)              # Te4[s1]
+       lbu     $t7,2($i3)              # Te4[s2]
+
+       _ins    $t8,24
+       _ins    $t9,24
+       _ins    $t10,24
+       _ins    $t11,24
+
+       lw      $s0,0($key0)
+       lw      $s1,4($key0)
+       lw      $s2,8($key0)
+       lw      $s3,12($key0)
+
+       xor     $t0,$t8
+       xor     $t1,$t9
+       xor     $t2,$t10
+       xor     $t3,$t11
+
+       _ins    $t4,0
+       _ins    $t5,0
+       _ins    $t6,0
+       _ins    $t7,0
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+
+       jr      $ra
+.end   _mips_AES_encrypt
+
+.align 5
+.globl AES_encrypt
+.ent   AES_encrypt
+AES_encrypt:
+       .frame  $sp,$FRAMESIZE,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 flavours only: PIC-ification through .cpload with $pf
+       .cpload $pf
+___
+$code.=<<___;  # allocate the frame and save $ra, $fp and $s4-$s11 in its top ten slots
+       $PTR_SUB $sp,$FRAMESIZE
+       $REG_S  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_S  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_S  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_S  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_S  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_S  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_S  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_S  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_S  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_S  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # NUBI flavours only: also save $12-$15 and $gp, keeping the non-NUBI prologue shorter
+       $REG_S  \$15,$FRAMESIZE-11*$SZREG($sp)
+       $REG_S  \$14,$FRAMESIZE-12*$SZREG($sp)
+       $REG_S  \$13,$FRAMESIZE-13*$SZREG($sp)
+       $REG_S  \$12,$FRAMESIZE-14*$SZREG($sp)
+       $REG_S  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);  # all flavours except o32: PIC-ification through .cplocal/.cpsetup
+       .cplocal        $Tbl
+       .cpsetup        $pf,$zero,AES_encrypt
+___
+$code.=<<___;  # load the AES_Te address and the 16 input bytes (lwl/lwr pairs), call the core routine, store the 16 output bytes (swr/swl pairs), then reload the registers saved by the prologue
+       .set    reorder
+       la      $Tbl,AES_Te             # PIC-ified 'load address'
+
+       lwl     $s0,0+$MSB($inp)
+       lwl     $s1,4+$MSB($inp)
+       lwl     $s2,8+$MSB($inp)
+       lwl     $s3,12+$MSB($inp)
+       lwr     $s0,0+$LSB($inp)
+       lwr     $s1,4+$LSB($inp)
+       lwr     $s2,8+$LSB($inp)
+       lwr     $s3,12+$LSB($inp)
+
+       bal     _mips_AES_encrypt
+
+       swr     $s0,0+$LSB($out)
+       swr     $s1,4+$LSB($out)
+       swr     $s2,8+$LSB($out)
+       swr     $s3,12+$LSB($out)
+       swl     $s0,0+$MSB($out)
+       swl     $s1,4+$MSB($out)
+       swl     $s2,8+$MSB($out)
+       swl     $s3,12+$MSB($out)
+
+       .set    noreorder
+       $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_L  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_L  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_L  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_L  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_L  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_L  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_L  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_L  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # NUBI flavours only: reload $12-$15 and $gp from the slots used by the NUBI prologue
+       $REG_L  \$15,$FRAMESIZE-11*$SZREG($sp)
+       $REG_L  \$14,$FRAMESIZE-12*$SZREG($sp)
+       $REG_L  \$13,$FRAMESIZE-13*$SZREG($sp)
+       $REG_L  \$12,$FRAMESIZE-14*$SZREG($sp)
+       $REG_L  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;  # return; still under .set noreorder, so the frame release follows jr in its delay slot
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE
+.end   AES_encrypt
+___
+\f
+$code.=<<___;
+.align 5
+.ent   _mips_AES_decrypt
+_mips_AES_decrypt:
+       .frame  $sp,0,$ra
+       .set    reorder
+       lw      $t0,0($key)
+       lw      $t1,4($key)
+       lw      $t2,8($key)
+       lw      $t3,12($key)
+       lw      $cnt,240($key)
+       $PTR_ADD $key0,$key,16
+
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+
+       sub     $cnt,1
+       _xtr    $i0,$s3,16-2
+.Loop_dec:
+       _xtr    $i1,$s0,16-2
+       _xtr    $i2,$s1,16-2
+       _xtr    $i3,$s2,16-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t0,3($i0)              # Td1[s3>>16]
+       lwl     $t1,3($i1)              # Td1[s0>>16]
+       lwl     $t2,3($i2)              # Td1[s1>>16]
+       lwl     $t3,3($i3)              # Td1[s2>>16]
+       lwr     $t0,2($i0)              # Td1[s3>>16]
+       lwr     $t1,2($i1)              # Td1[s0>>16]
+       lwr     $t2,2($i2)              # Td1[s1>>16]
+       lwr     $t3,2($i3)              # Td1[s2>>16]
+
+       _xtr    $i0,$s2,8-2
+       _xtr    $i1,$s3,8-2
+       _xtr    $i2,$s0,8-2
+       _xtr    $i3,$s1,8-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t4,2($i0)              # Td2[s2>>8]
+       lwl     $t5,2($i1)              # Td2[s3>>8]
+       lwl     $t6,2($i2)              # Td2[s0>>8]
+       lwl     $t7,2($i3)              # Td2[s1>>8]
+       lwr     $t4,1($i0)              # Td2[s2>>8]
+       lwr     $t5,1($i1)              # Td2[s3>>8]
+       lwr     $t6,1($i2)              # Td2[s0>>8]
+       lwr     $t7,1($i3)              # Td2[s1>>8]
+
+       _xtr    $i0,$s1,0-2
+       _xtr    $i1,$s2,0-2
+       _xtr    $i2,$s3,0-2
+       _xtr    $i3,$s0,0-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lwl     $t8,1($i0)              # Td3[s1]
+       lwl     $t9,1($i1)              # Td3[s2]
+       lwl     $t10,1($i2)             # Td3[s3]
+       lwl     $t11,1($i3)             # Td3[s0]
+       lwr     $t8,0($i0)              # Td3[s1]
+       lwr     $t9,0($i1)              # Td3[s2]
+       lwr     $t10,0($i2)             # Td3[s3]
+       lwr     $t11,0($i3)             # Td3[s0]
+
+       _xtr    $i0,$s0,24-2
+       _xtr    $i1,$s1,24-2
+       _xtr    $i2,$s2,24-2
+       _xtr    $i3,$s3,24-2
+       and     $i0,0x3fc
+       and     $i1,0x3fc
+       and     $i2,0x3fc
+       and     $i3,0x3fc
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+
+       lw      $t4,0($i0)              # Td0[s0>>24]
+       lw      $t5,0($i1)              # Td0[s1>>24]
+       lw      $t6,0($i2)              # Td0[s2>>24]
+       lw      $t7,0($i3)              # Td0[s3>>24]
+
+       lw      $s0,0($key0)
+       lw      $s1,4($key0)
+       lw      $s2,8($key0)
+       lw      $s3,12($key0)
+
+       xor     $t0,$t8
+       xor     $t1,$t9
+       xor     $t2,$t10
+       xor     $t3,$t11
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       sub     $cnt,1
+       $PTR_ADD $key0,16
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+       .set    noreorder
+       bnez    $cnt,.Loop_dec
+       _xtr    $i0,$s3,16-2
+
+       .set    reorder
+       lw      $t4,1024($Tbl)          # prefetch Td4
+       lw      $t5,1024+32($Tbl)
+       lw      $t6,1024+64($Tbl)
+       lw      $t7,1024+96($Tbl)
+       lw      $t8,1024+128($Tbl)
+       lw      $t9,1024+160($Tbl)
+       lw      $t10,1024+192($Tbl)
+       lw      $t11,1024+224($Tbl)
+
+       _xtr    $i0,$s3,16
+       _xtr    $i1,$s0,16
+       _xtr    $i2,$s1,16
+       _xtr    $i3,$s2,16
+       and     $i0,0xff
+       and     $i1,0xff
+       and     $i2,0xff
+       and     $i3,0xff
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t0,1024($i0)           # Td4[s3>>16]
+       lbu     $t1,1024($i1)           # Td4[s0>>16]
+       lbu     $t2,1024($i2)           # Td4[s1>>16]
+       lbu     $t3,1024($i3)           # Td4[s2>>16]
+
+       _xtr    $i0,$s2,8
+       _xtr    $i1,$s3,8
+       _xtr    $i2,$s0,8
+       _xtr    $i3,$s1,8
+       and     $i0,0xff
+       and     $i1,0xff
+       and     $i2,0xff
+       and     $i3,0xff
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t4,1024($i0)           # Td4[s2>>8]
+       lbu     $t5,1024($i1)           # Td4[s3>>8]
+       lbu     $t6,1024($i2)           # Td4[s0>>8]
+       lbu     $t7,1024($i3)           # Td4[s1>>8]
+
+       _xtr    $i0,$s0,24
+       _xtr    $i1,$s1,24
+       _xtr    $i2,$s2,24
+       _xtr    $i3,$s3,24
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t8,1024($i0)           # Td4[s0>>24]
+       lbu     $t9,1024($i1)           # Td4[s1>>24]
+       lbu     $t10,1024($i2)          # Td4[s2>>24]
+       lbu     $t11,1024($i3)          # Td4[s3>>24]
+
+       _xtr    $i0,$s1,0
+       _xtr    $i1,$s2,0
+       _xtr    $i2,$s3,0
+       _xtr    $i3,$s0,0
+
+       _ins    $t0,16
+       _ins    $t1,16
+       _ins    $t2,16
+       _ins    $t3,16
+
+       _ins    $t4,8
+       _ins    $t5,8
+       _ins    $t6,8
+       _ins    $t7,8
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $t4,1024($i0)           # Td4[s1]
+       lbu     $t5,1024($i1)           # Td4[s2]
+       lbu     $t6,1024($i2)           # Td4[s3]
+       lbu     $t7,1024($i3)           # Td4[s0]
+
+       _ins    $t8,24
+       _ins    $t9,24
+       _ins    $t10,24
+       _ins    $t11,24
+
+       lw      $s0,0($key0)
+       lw      $s1,4($key0)
+       lw      $s2,8($key0)
+       lw      $s3,12($key0)
+
+       _ins    $t4,0
+       _ins    $t5,0
+       _ins    $t6,0
+       _ins    $t7,0
+
+
+       xor     $t0,$t8
+       xor     $t1,$t9
+       xor     $t2,$t10
+       xor     $t3,$t11
+
+       xor     $t0,$t4
+       xor     $t1,$t5
+       xor     $t2,$t6
+       xor     $t3,$t7
+
+       xor     $s0,$t0
+       xor     $s1,$t1
+       xor     $s2,$t2
+       xor     $s3,$t3
+
+       jr      $ra
+.end   _mips_AES_decrypt
+
+.align 5
+.globl AES_decrypt
+.ent   AES_decrypt
+AES_decrypt:
+       .frame  $sp,$FRAMESIZE,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 flavours only: PIC-ification through .cpload with $pf
+       .cpload $pf
+___
+$code.=<<___;  # allocate the frame and save $ra, $fp and $s4-$s11 in its top ten slots
+       $PTR_SUB $sp,$FRAMESIZE
+       $REG_S  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_S  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_S  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_S  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_S  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_S  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_S  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_S  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_S  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_S  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # NUBI flavours only: also save $12-$15 and $gp, keeping the non-NUBI prologue shorter
+       $REG_S  \$15,$FRAMESIZE-11*$SZREG($sp)
+       $REG_S  \$14,$FRAMESIZE-12*$SZREG($sp)
+       $REG_S  \$13,$FRAMESIZE-13*$SZREG($sp)
+       $REG_S  \$12,$FRAMESIZE-14*$SZREG($sp)
+       $REG_S  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);  # all flavours except o32: PIC-ification through .cplocal/.cpsetup
+       .cplocal        $Tbl
+       .cpsetup        $pf,$zero,AES_decrypt
+___
+$code.=<<___;  # load the AES_Td address and the 16 input bytes (lwl/lwr pairs), call the core routine, store the 16 output bytes (swr/swl pairs), then reload the registers saved by the prologue
+       .set    reorder
+       la      $Tbl,AES_Td             # PIC-ified 'load address'
+
+       lwl     $s0,0+$MSB($inp)
+       lwl     $s1,4+$MSB($inp)
+       lwl     $s2,8+$MSB($inp)
+       lwl     $s3,12+$MSB($inp)
+       lwr     $s0,0+$LSB($inp)
+       lwr     $s1,4+$LSB($inp)
+       lwr     $s2,8+$LSB($inp)
+       lwr     $s3,12+$LSB($inp)
+
+       bal     _mips_AES_decrypt
+
+       swr     $s0,0+$LSB($out)
+       swr     $s1,4+$LSB($out)
+       swr     $s2,8+$LSB($out)
+       swr     $s3,12+$LSB($out)
+       swl     $s0,0+$MSB($out)
+       swl     $s1,4+$MSB($out)
+       swl     $s2,8+$MSB($out)
+       swl     $s3,12+$MSB($out)
+
+       .set    noreorder
+       $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_L  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_L  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_L  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_L  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_L  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_L  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_L  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_L  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # NUBI flavours only: reload $12-$15 and $gp from the slots used by the NUBI prologue
+       $REG_L  \$15,$FRAMESIZE-11*$SZREG($sp)
+       $REG_L  \$14,$FRAMESIZE-12*$SZREG($sp)
+       $REG_L  \$13,$FRAMESIZE-13*$SZREG($sp)
+       $REG_L  \$12,$FRAMESIZE-14*$SZREG($sp)
+       $REG_L  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;  # return; still under .set noreorder, so the frame release follows jr in its delay slot
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE
+.end   AES_decrypt
+___
+}}}
+\f
+{{{
+my $FRAMESIZE=8*$SZREG;        # frame of the key-setup entry points: 8 register-sized slots
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;     # emitted in .mask; $31 and $30 always, NUBI adds $15-$12 and $3
+
+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);   # user key pointer, key length in bits, key-schedule pointer, table pointer
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);    # up to eight key words; $s0-$s3 here are the file-level $12-$15
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);        # byte-lookup temporaries; $i1 shares $t0, which also carries the status code set with li $t0,...
+my ($rcon,$cnt)=($gp,$fp);     # pointer initialised to $Tbl+1024+256 and stepped by 4 per iteration; loop counter
+
+$code.=<<___;
+.align 5
+.ent   _mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+       .frame  $sp,0,$ra
+       .set    noreorder
+       beqz    $inp,.Lekey_done
+       li      $t0,-1
+       beqz    $key,.Lekey_done
+       $PTR_ADD $rcon,$Tbl,1024+256
+
+       .set    reorder
+       lwl     $rk0,0+$MSB($inp)       # load 128 bits
+       lwl     $rk1,4+$MSB($inp)
+       lwl     $rk2,8+$MSB($inp)
+       lwl     $rk3,12+$MSB($inp)
+       li      $at,128
+       lwr     $rk0,0+$LSB($inp)
+       lwr     $rk1,4+$LSB($inp)
+       lwr     $rk2,8+$LSB($inp)
+       lwr     $rk3,12+$LSB($inp)
+       .set    noreorder
+       beq     $bits,$at,.L128bits
+       li      $cnt,10
+
+       .set    reorder
+       lwl     $rk4,16+$MSB($inp)      # load 192 bits
+       lwl     $rk5,20+$MSB($inp)
+       li      $at,192
+       lwr     $rk4,16+$LSB($inp)
+       lwr     $rk5,20+$LSB($inp)
+       .set    noreorder
+       beq     $bits,$at,.L192bits
+       li      $cnt,8
+
+       .set    reorder
+       lwl     $rk6,24+$MSB($inp)      # load 256 bits
+       lwl     $rk7,28+$MSB($inp)
+       li      $at,256
+       lwr     $rk6,24+$LSB($inp)
+       lwr     $rk7,28+$LSB($inp)
+       .set    noreorder
+       beq     $bits,$at,.L256bits
+       li      $cnt,7
+
+       b       .Lekey_done
+       li      $t0,-2
+
+.align 4
+.L128bits:
+       .set    reorder
+       srl     $i0,$rk3,16
+       srl     $i1,$rk3,8
+       and     $i0,0xff
+       and     $i1,0xff
+       and     $i2,$rk3,0xff
+       srl     $i3,$rk3,24
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $i0,1024($i0)
+       lbu     $i1,1024($i1)
+       lbu     $i2,1024($i2)
+       lbu     $i3,1024($i3)
+
+       sw      $rk0,0($key)
+       sw      $rk1,4($key)
+       sw      $rk2,8($key)
+       sw      $rk3,12($key)
+       sub     $cnt,1
+       $PTR_ADD $key,16
+
+       _bias   $i0,24
+       _bias   $i1,16
+       _bias   $i2,8
+       _bias   $i3,0
+
+       xor     $rk0,$i0
+       lw      $i0,0($rcon)
+       xor     $rk0,$i1
+       xor     $rk0,$i2
+       xor     $rk0,$i3
+       xor     $rk0,$i0
+
+       xor     $rk1,$rk0
+       xor     $rk2,$rk1
+       xor     $rk3,$rk2
+
+       .set    noreorder
+       bnez    $cnt,.L128bits
+       $PTR_ADD $rcon,4
+
+       sw      $rk0,0($key)
+       sw      $rk1,4($key)
+       sw      $rk2,8($key)
+       li      $cnt,10
+       sw      $rk3,12($key)
+       li      $t0,0
+       sw      $cnt,80($key)
+       b       .Lekey_done
+       $PTR_SUB $key,10*16
+
+.align 4
+.L192bits:
+       .set    reorder
+       srl     $i0,$rk5,16
+       srl     $i1,$rk5,8
+       and     $i0,0xff
+       and     $i1,0xff
+       and     $i2,$rk5,0xff
+       srl     $i3,$rk5,24
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $i0,1024($i0)
+       lbu     $i1,1024($i1)
+       lbu     $i2,1024($i2)
+       lbu     $i3,1024($i3)
+
+       sw      $rk0,0($key)
+       sw      $rk1,4($key)
+       sw      $rk2,8($key)
+       sw      $rk3,12($key)
+       sw      $rk4,16($key)
+       sw      $rk5,20($key)
+       sub     $cnt,1
+       $PTR_ADD $key,24
+
+       _bias   $i0,24
+       _bias   $i1,16
+       _bias   $i2,8
+       _bias   $i3,0
+
+       xor     $rk0,$i0
+       lw      $i0,0($rcon)
+       xor     $rk0,$i1
+       xor     $rk0,$i2
+       xor     $rk0,$i3
+       xor     $rk0,$i0
+
+       xor     $rk1,$rk0
+       xor     $rk2,$rk1
+       xor     $rk3,$rk2
+       xor     $rk4,$rk3
+       xor     $rk5,$rk4
+
+       .set    noreorder
+       bnez    $cnt,.L192bits
+       $PTR_ADD $rcon,4
+
+       sw      $rk0,0($key)
+       sw      $rk1,4($key)
+       sw      $rk2,8($key)
+       li      $cnt,12
+       sw      $rk3,12($key)
+       li      $t0,0
+       sw      $cnt,48($key)
+       b       .Lekey_done
+       $PTR_SUB $key,12*16
+
+.align 4
+.L256bits:
+       .set    reorder
+       srl     $i0,$rk7,16
+       srl     $i1,$rk7,8
+       and     $i0,0xff
+       and     $i1,0xff
+       and     $i2,$rk7,0xff
+       srl     $i3,$rk7,24
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $i0,1024($i0)
+       lbu     $i1,1024($i1)
+       lbu     $i2,1024($i2)
+       lbu     $i3,1024($i3)
+
+       sw      $rk0,0($key)
+       sw      $rk1,4($key)
+       sw      $rk2,8($key)
+       sw      $rk3,12($key)
+       sw      $rk4,16($key)
+       sw      $rk5,20($key)
+       sw      $rk6,24($key)
+       sw      $rk7,28($key)
+       sub     $cnt,1
+
+       _bias   $i0,24
+       _bias   $i1,16
+       _bias   $i2,8
+       _bias   $i3,0
+
+       xor     $rk0,$i0
+       lw      $i0,0($rcon)
+       xor     $rk0,$i1
+       xor     $rk0,$i2
+       xor     $rk0,$i3
+       xor     $rk0,$i0
+
+       xor     $rk1,$rk0
+       xor     $rk2,$rk1
+       xor     $rk3,$rk2
+       beqz    $cnt,.L256bits_done
+
+       srl     $i0,$rk3,24
+       srl     $i1,$rk3,16
+       srl     $i2,$rk3,8
+       and     $i3,$rk3,0xff
+       and     $i1,0xff
+       and     $i2,0xff
+       $PTR_ADD $i0,$Tbl
+       $PTR_ADD $i1,$Tbl
+       $PTR_ADD $i2,$Tbl
+       $PTR_ADD $i3,$Tbl
+       lbu     $i0,1024($i0)
+       lbu     $i1,1024($i1)
+       lbu     $i2,1024($i2)
+       lbu     $i3,1024($i3)
+       sll     $i0,24
+       sll     $i1,16
+       sll     $i2,8
+
+       xor     $rk4,$i0
+       xor     $rk4,$i1
+       xor     $rk4,$i2
+       xor     $rk4,$i3
+
+       xor     $rk5,$rk4
+       xor     $rk6,$rk5
+       xor     $rk7,$rk6
+
+       $PTR_ADD $key,32
+       .set    noreorder
+       b       .L256bits
+       $PTR_ADD $rcon,4
+
+.L256bits_done:
+       sw      $rk0,32($key)
+       sw      $rk1,36($key)
+       sw      $rk2,40($key)
+       li      $cnt,14
+       sw      $rk3,44($key)
+       li      $t0,0
+       sw      $cnt,48($key)
+       $PTR_SUB $key,12*16
+
+.Lekey_done:
+       jr      $ra
+       nop
+.end   _mips_AES_set_encrypt_key
+
+.globl AES_set_encrypt_key
+.ent   AES_set_encrypt_key
+AES_set_encrypt_key:
+       .frame  $sp,$FRAMESIZE,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+___
+# Prologue/epilogue of the public AES_set_encrypt_key entry point. The
+# frame is $FRAMESIZE (8 register-sized slots) deep: $ra and $fp occupy
+# the top two slots, and NUBI flavours additionally keep $s3..$s0 and
+# $gp in the five slots below them.
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 PIC-ification
+       .cpload $pf
+___
+$code.=<<___;
+       $PTR_SUB $sp,$FRAMESIZE
+       $REG_S  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_S  $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+       $REG_S  $s3,$FRAMESIZE-3*$SZREG($sp)
+       $REG_S  $s2,$FRAMESIZE-4*$SZREG($sp)
+       $REG_S  $s1,$FRAMESIZE-5*$SZREG($sp)
+       $REG_S  $s0,$FRAMESIZE-6*$SZREG($sp)
+       $REG_S  $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);  # non-o32 PIC-ification
+       .cplocal        $Tbl
+       .cpsetup        $pf,$zero,AES_set_encrypt_key
+___
+# Call the worker with $Tbl pointing at AES_Te, copy its status from
+# $t0 into the return register $a0, then begin restoring registers.
+$code.=<<___;
+       .set    reorder
+       la      $Tbl,AES_Te             # PIC-ified 'load address'
+
+       bal     _mips_AES_set_encrypt_key
+
+       .set    noreorder
+       move    $a0,$t0
+       $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+# NUBI only: reload from exactly the slots the prologue stored to
+# (-3..-7). The offsets -11..-15 used previously belong to the larger
+# 16-slot frame of AES_encrypt/AES_decrypt; with this 8-slot frame they
+# addressed memory below $sp and never restored the saved values.
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $s3,$FRAMESIZE-3*$SZREG($sp)
+       $REG_L  $s2,$FRAMESIZE-4*$SZREG($sp)
+       $REG_L  $s1,$FRAMESIZE-5*$SZREG($sp)
+       $REG_L  $s0,$FRAMESIZE-6*$SZREG($sp)
+       $REG_L  $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+# Return; under .set noreorder the frame release sits in the jr delay slot.
+$code.=<<___;
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE
+.end   AES_set_encrypt_key
+___
+\f
+# AES_set_decrypt_key: run the encryption key schedule, reverse the
+# order of the round keys in place (.Lswap), then pass every word of
+# the inner round keys (all but the first and the last one) through
+# the inverse MixColumns transform (.Lmix). tp2, tp4 and tp8 hold the
+# word multiplied by 2, 4 and 8 in GF(2^8), computed on all four bytes
+# at once with the 0x80808080/0x7f7f7f7f/0x1b1b1b1b masks; tp9, tpb,
+# tpd and tpe are the 9, 11, 13 and 14 multiples built from them.
+# A negative status from the key schedule skips the conversion and is
+# returned as is.
+my ($head,$tail)=($inp,$bits);
+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
+$code.=<<___;
+.align 5
+.globl AES_set_decrypt_key
+.ent   AES_set_decrypt_key
+AES_set_decrypt_key:
+       .frame  $sp,$FRAMESIZE,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 PIC-ification
+       .cpload $pf
+___
+$code.=<<___;
+       $PTR_SUB $sp,$FRAMESIZE
+       $REG_S  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_S  $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+       $REG_S  $s3,$FRAMESIZE-3*$SZREG($sp)
+       $REG_S  $s2,$FRAMESIZE-4*$SZREG($sp)
+       $REG_S  $s1,$FRAMESIZE-5*$SZREG($sp)
+       $REG_S  $s0,$FRAMESIZE-6*$SZREG($sp)
+       $REG_S  $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);  # non-o32 PIC-ification
+       .cplocal        $Tbl
+       .cpsetup        $pf,$zero,AES_set_decrypt_key
+___
+$code.=<<___;
+       .set    reorder
+       la      $Tbl,AES_Te             # PIC-ified 'load address'
+
+       bal     _mips_AES_set_encrypt_key
+
+       bltz    $t0,.Ldkey_done
+
+       sll     $at,$cnt,4
+       $PTR_ADD $head,$key,0
+       $PTR_ADD $tail,$key,$at
+.align 4
+.Lswap:
+       lw      $rk0,0($head)
+       lw      $rk1,4($head)
+       lw      $rk2,8($head)
+       lw      $rk3,12($head)
+       lw      $rk4,0($tail)
+       lw      $rk5,4($tail)
+       lw      $rk6,8($tail)
+       lw      $rk7,12($tail)
+       sw      $rk0,0($tail)
+       sw      $rk1,4($tail)
+       sw      $rk2,8($tail)
+       sw      $rk3,12($tail)
+       $PTR_ADD $head,16
+       $PTR_SUB $tail,16
+       sw      $rk4,-16($head)
+       sw      $rk5,-12($head)
+       sw      $rk6,-8($head)
+       sw      $rk7,-4($head)
+       bne     $head,$tail,.Lswap
+
+       lw      $tp1,16($key)           # modulo-scheduled
+       lui     $x80808080,0x8080
+       sub     $cnt,1
+       or      $x80808080,0x8080
+       sll     $cnt,2
+       $PTR_ADD $key,16
+       lui     $x1b1b1b1b,0x1b1b
+       nor     $x7f7f7f7f,$zero,$x80808080
+       or      $x1b1b1b1b,0x1b1b
+.align 4
+.Lmix:
+       and     $m,$tp1,$x80808080
+       and     $tp2,$tp1,$x7f7f7f7f
+       srl     $tp4,$m,7
+       addu    $tp2,$tp2               # tp2<<1
+       subu    $m,$tp4
+       and     $m,$x1b1b1b1b
+       xor     $tp2,$m
+
+       and     $m,$tp2,$x80808080
+       and     $tp4,$tp2,$x7f7f7f7f
+       srl     $tp8,$m,7
+       addu    $tp4,$tp4               # tp4<<1
+       subu    $m,$tp8
+       and     $m,$x1b1b1b1b
+       xor     $tp4,$m
+
+       and     $m,$tp4,$x80808080
+       and     $tp8,$tp4,$x7f7f7f7f
+       srl     $tp9,$m,7
+       addu    $tp8,$tp8               # tp8<<1
+       subu    $m,$tp9
+       and     $m,$x1b1b1b1b
+       xor     $tp8,$m
+
+       xor     $tp9,$tp8,$tp1
+       xor     $tpe,$tp8,$tp4
+       xor     $tpb,$tp9,$tp2
+       xor     $tpd,$tp9,$tp4
+
+       _ror    $tp1,$tpd,16
+        xor    $tpe,$tp2
+       _ror    $tp2,$tpd,-16
+       xor     $tpe,$tp1
+       _ror    $tp1,$tp9,8
+       xor     $tpe,$tp2
+       _ror    $tp2,$tp9,-24
+       xor     $tpe,$tp1
+       _ror    $tp1,$tpb,24
+       xor     $tpe,$tp2
+       _ror    $tp2,$tpb,-8
+       xor     $tpe,$tp1
+       lw      $tp1,4($key)            # modulo-scheduled
+       xor     $tpe,$tp2
+       sub     $cnt,1
+       sw      $tpe,0($key)
+       $PTR_ADD $key,4
+       bnez    $cnt,.Lmix
+
+       li      $t0,0
+.Ldkey_done:
+       .set    noreorder
+       move    $a0,$t0
+       $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
+___
+# s0-s3 carry tp9/tpb/tpd/tpe above, so under NUBI they have to be
+# reloaded from the very slots the NUBI-only prologue stored them to
+# (3rd through 7th register slot below the frame top).
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $s3,$FRAMESIZE-3*$SZREG($sp)
+       $REG_L  $s2,$FRAMESIZE-4*$SZREG($sp)
+       $REG_L  $s1,$FRAMESIZE-5*$SZREG($sp)
+       $REG_L  $s0,$FRAMESIZE-6*$SZREG($sp)
+       $REG_L  $gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___;
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE
+.end   AES_set_decrypt_key
+___
+}}}
+
+######################################################################
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+.rdata
+.align 6
+AES_Te:
+.byte  0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84     # Te0
+.byte  0xee,0x77,0x77,0x99,    0xf6,0x7b,0x7b,0x8d
+.byte  0xff,0xf2,0xf2,0x0d,    0xd6,0x6b,0x6b,0xbd
+.byte  0xde,0x6f,0x6f,0xb1,    0x91,0xc5,0xc5,0x54
+.byte  0x60,0x30,0x30,0x50,    0x02,0x01,0x01,0x03
+.byte  0xce,0x67,0x67,0xa9,    0x56,0x2b,0x2b,0x7d
+.byte  0xe7,0xfe,0xfe,0x19,    0xb5,0xd7,0xd7,0x62
+.byte  0x4d,0xab,0xab,0xe6,    0xec,0x76,0x76,0x9a
+.byte  0x8f,0xca,0xca,0x45,    0x1f,0x82,0x82,0x9d
+.byte  0x89,0xc9,0xc9,0x40,    0xfa,0x7d,0x7d,0x87
+.byte  0xef,0xfa,0xfa,0x15,    0xb2,0x59,0x59,0xeb
+.byte  0x8e,0x47,0x47,0xc9,    0xfb,0xf0,0xf0,0x0b
+.byte  0x41,0xad,0xad,0xec,    0xb3,0xd4,0xd4,0x67
+.byte  0x5f,0xa2,0xa2,0xfd,    0x45,0xaf,0xaf,0xea
+.byte  0x23,0x9c,0x9c,0xbf,    0x53,0xa4,0xa4,0xf7
+.byte  0xe4,0x72,0x72,0x96,    0x9b,0xc0,0xc0,0x5b
+.byte  0x75,0xb7,0xb7,0xc2,    0xe1,0xfd,0xfd,0x1c
+.byte  0x3d,0x93,0x93,0xae,    0x4c,0x26,0x26,0x6a
+.byte  0x6c,0x36,0x36,0x5a,    0x7e,0x3f,0x3f,0x41
+.byte  0xf5,0xf7,0xf7,0x02,    0x83,0xcc,0xcc,0x4f
+.byte  0x68,0x34,0x34,0x5c,    0x51,0xa5,0xa5,0xf4
+.byte  0xd1,0xe5,0xe5,0x34,    0xf9,0xf1,0xf1,0x08
+.byte  0xe2,0x71,0x71,0x93,    0xab,0xd8,0xd8,0x73
+.byte  0x62,0x31,0x31,0x53,    0x2a,0x15,0x15,0x3f
+.byte  0x08,0x04,0x04,0x0c,    0x95,0xc7,0xc7,0x52
+.byte  0x46,0x23,0x23,0x65,    0x9d,0xc3,0xc3,0x5e
+.byte  0x30,0x18,0x18,0x28,    0x37,0x96,0x96,0xa1
+.byte  0x0a,0x05,0x05,0x0f,    0x2f,0x9a,0x9a,0xb5
+.byte  0x0e,0x07,0x07,0x09,    0x24,0x12,0x12,0x36
+.byte  0x1b,0x80,0x80,0x9b,    0xdf,0xe2,0xe2,0x3d
+.byte  0xcd,0xeb,0xeb,0x26,    0x4e,0x27,0x27,0x69
+.byte  0x7f,0xb2,0xb2,0xcd,    0xea,0x75,0x75,0x9f
+.byte  0x12,0x09,0x09,0x1b,    0x1d,0x83,0x83,0x9e
+.byte  0x58,0x2c,0x2c,0x74,    0x34,0x1a,0x1a,0x2e
+.byte  0x36,0x1b,0x1b,0x2d,    0xdc,0x6e,0x6e,0xb2
+.byte  0xb4,0x5a,0x5a,0xee,    0x5b,0xa0,0xa0,0xfb
+.byte  0xa4,0x52,0x52,0xf6,    0x76,0x3b,0x3b,0x4d
+.byte  0xb7,0xd6,0xd6,0x61,    0x7d,0xb3,0xb3,0xce
+.byte  0x52,0x29,0x29,0x7b,    0xdd,0xe3,0xe3,0x3e
+.byte  0x5e,0x2f,0x2f,0x71,    0x13,0x84,0x84,0x97
+.byte  0xa6,0x53,0x53,0xf5,    0xb9,0xd1,0xd1,0x68
+.byte  0x00,0x00,0x00,0x00,    0xc1,0xed,0xed,0x2c
+.byte  0x40,0x20,0x20,0x60,    0xe3,0xfc,0xfc,0x1f
+.byte  0x79,0xb1,0xb1,0xc8,    0xb6,0x5b,0x5b,0xed
+.byte  0xd4,0x6a,0x6a,0xbe,    0x8d,0xcb,0xcb,0x46
+.byte  0x67,0xbe,0xbe,0xd9,    0x72,0x39,0x39,0x4b
+.byte  0x94,0x4a,0x4a,0xde,    0x98,0x4c,0x4c,0xd4
+.byte  0xb0,0x58,0x58,0xe8,    0x85,0xcf,0xcf,0x4a
+.byte  0xbb,0xd0,0xd0,0x6b,    0xc5,0xef,0xef,0x2a
+.byte  0x4f,0xaa,0xaa,0xe5,    0xed,0xfb,0xfb,0x16
+.byte  0x86,0x43,0x43,0xc5,    0x9a,0x4d,0x4d,0xd7
+.byte  0x66,0x33,0x33,0x55,    0x11,0x85,0x85,0x94
+.byte  0x8a,0x45,0x45,0xcf,    0xe9,0xf9,0xf9,0x10
+.byte  0x04,0x02,0x02,0x06,    0xfe,0x7f,0x7f,0x81
+.byte  0xa0,0x50,0x50,0xf0,    0x78,0x3c,0x3c,0x44
+.byte  0x25,0x9f,0x9f,0xba,    0x4b,0xa8,0xa8,0xe3
+.byte  0xa2,0x51,0x51,0xf3,    0x5d,0xa3,0xa3,0xfe
+.byte  0x80,0x40,0x40,0xc0,    0x05,0x8f,0x8f,0x8a
+.byte  0x3f,0x92,0x92,0xad,    0x21,0x9d,0x9d,0xbc
+.byte  0x70,0x38,0x38,0x48,    0xf1,0xf5,0xf5,0x04
+.byte  0x63,0xbc,0xbc,0xdf,    0x77,0xb6,0xb6,0xc1
+.byte  0xaf,0xda,0xda,0x75,    0x42,0x21,0x21,0x63
+.byte  0x20,0x10,0x10,0x30,    0xe5,0xff,0xff,0x1a
+.byte  0xfd,0xf3,0xf3,0x0e,    0xbf,0xd2,0xd2,0x6d
+.byte  0x81,0xcd,0xcd,0x4c,    0x18,0x0c,0x0c,0x14
+.byte  0x26,0x13,0x13,0x35,    0xc3,0xec,0xec,0x2f
+.byte  0xbe,0x5f,0x5f,0xe1,    0x35,0x97,0x97,0xa2
+.byte  0x88,0x44,0x44,0xcc,    0x2e,0x17,0x17,0x39
+.byte  0x93,0xc4,0xc4,0x57,    0x55,0xa7,0xa7,0xf2
+.byte  0xfc,0x7e,0x7e,0x82,    0x7a,0x3d,0x3d,0x47
+.byte  0xc8,0x64,0x64,0xac,    0xba,0x5d,0x5d,0xe7
+.byte  0x32,0x19,0x19,0x2b,    0xe6,0x73,0x73,0x95
+.byte  0xc0,0x60,0x60,0xa0,    0x19,0x81,0x81,0x98
+.byte  0x9e,0x4f,0x4f,0xd1,    0xa3,0xdc,0xdc,0x7f
+.byte  0x44,0x22,0x22,0x66,    0x54,0x2a,0x2a,0x7e
+.byte  0x3b,0x90,0x90,0xab,    0x0b,0x88,0x88,0x83
+.byte  0x8c,0x46,0x46,0xca,    0xc7,0xee,0xee,0x29
+.byte  0x6b,0xb8,0xb8,0xd3,    0x28,0x14,0x14,0x3c
+.byte  0xa7,0xde,0xde,0x79,    0xbc,0x5e,0x5e,0xe2
+.byte  0x16,0x0b,0x0b,0x1d,    0xad,0xdb,0xdb,0x76
+.byte  0xdb,0xe0,0xe0,0x3b,    0x64,0x32,0x32,0x56
+.byte  0x74,0x3a,0x3a,0x4e,    0x14,0x0a,0x0a,0x1e
+.byte  0x92,0x49,0x49,0xdb,    0x0c,0x06,0x06,0x0a
+.byte  0x48,0x24,0x24,0x6c,    0xb8,0x5c,0x5c,0xe4
+.byte  0x9f,0xc2,0xc2,0x5d,    0xbd,0xd3,0xd3,0x6e
+.byte  0x43,0xac,0xac,0xef,    0xc4,0x62,0x62,0xa6
+.byte  0x39,0x91,0x91,0xa8,    0x31,0x95,0x95,0xa4
+.byte  0xd3,0xe4,0xe4,0x37,    0xf2,0x79,0x79,0x8b
+.byte  0xd5,0xe7,0xe7,0x32,    0x8b,0xc8,0xc8,0x43
+.byte  0x6e,0x37,0x37,0x59,    0xda,0x6d,0x6d,0xb7
+.byte  0x01,0x8d,0x8d,0x8c,    0xb1,0xd5,0xd5,0x64
+.byte  0x9c,0x4e,0x4e,0xd2,    0x49,0xa9,0xa9,0xe0
+.byte  0xd8,0x6c,0x6c,0xb4,    0xac,0x56,0x56,0xfa
+.byte  0xf3,0xf4,0xf4,0x07,    0xcf,0xea,0xea,0x25
+.byte  0xca,0x65,0x65,0xaf,    0xf4,0x7a,0x7a,0x8e
+.byte  0x47,0xae,0xae,0xe9,    0x10,0x08,0x08,0x18
+.byte  0x6f,0xba,0xba,0xd5,    0xf0,0x78,0x78,0x88
+.byte  0x4a,0x25,0x25,0x6f,    0x5c,0x2e,0x2e,0x72
+.byte  0x38,0x1c,0x1c,0x24,    0x57,0xa6,0xa6,0xf1
+.byte  0x73,0xb4,0xb4,0xc7,    0x97,0xc6,0xc6,0x51
+.byte  0xcb,0xe8,0xe8,0x23,    0xa1,0xdd,0xdd,0x7c
+.byte  0xe8,0x74,0x74,0x9c,    0x3e,0x1f,0x1f,0x21
+.byte  0x96,0x4b,0x4b,0xdd,    0x61,0xbd,0xbd,0xdc
+.byte  0x0d,0x8b,0x8b,0x86,    0x0f,0x8a,0x8a,0x85
+.byte  0xe0,0x70,0x70,0x90,    0x7c,0x3e,0x3e,0x42
+.byte  0x71,0xb5,0xb5,0xc4,    0xcc,0x66,0x66,0xaa
+.byte  0x90,0x48,0x48,0xd8,    0x06,0x03,0x03,0x05
+.byte  0xf7,0xf6,0xf6,0x01,    0x1c,0x0e,0x0e,0x12
+.byte  0xc2,0x61,0x61,0xa3,    0x6a,0x35,0x35,0x5f
+.byte  0xae,0x57,0x57,0xf9,    0x69,0xb9,0xb9,0xd0
+.byte  0x17,0x86,0x86,0x91,    0x99,0xc1,0xc1,0x58
+.byte  0x3a,0x1d,0x1d,0x27,    0x27,0x9e,0x9e,0xb9
+.byte  0xd9,0xe1,0xe1,0x38,    0xeb,0xf8,0xf8,0x13
+.byte  0x2b,0x98,0x98,0xb3,    0x22,0x11,0x11,0x33
+.byte  0xd2,0x69,0x69,0xbb,    0xa9,0xd9,0xd9,0x70
+.byte  0x07,0x8e,0x8e,0x89,    0x33,0x94,0x94,0xa7
+.byte  0x2d,0x9b,0x9b,0xb6,    0x3c,0x1e,0x1e,0x22
+.byte  0x15,0x87,0x87,0x92,    0xc9,0xe9,0xe9,0x20
+.byte  0x87,0xce,0xce,0x49,    0xaa,0x55,0x55,0xff
+.byte  0x50,0x28,0x28,0x78,    0xa5,0xdf,0xdf,0x7a
+.byte  0x03,0x8c,0x8c,0x8f,    0x59,0xa1,0xa1,0xf8
+.byte  0x09,0x89,0x89,0x80,    0x1a,0x0d,0x0d,0x17
+.byte  0x65,0xbf,0xbf,0xda,    0xd7,0xe6,0xe6,0x31
+.byte  0x84,0x42,0x42,0xc6,    0xd0,0x68,0x68,0xb8
+.byte  0x82,0x41,0x41,0xc3,    0x29,0x99,0x99,0xb0
+.byte  0x5a,0x2d,0x2d,0x77,    0x1e,0x0f,0x0f,0x11
+.byte  0x7b,0xb0,0xb0,0xcb,    0xa8,0x54,0x54,0xfc
+.byte  0x6d,0xbb,0xbb,0xd6,    0x2c,0x16,0x16,0x3a
+
+.byte  0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5  # Te4
+.byte  0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte  0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte  0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte  0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte  0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte  0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte  0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte  0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte  0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte  0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte  0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte  0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte  0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte  0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte  0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte  0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte  0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte  0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte  0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte  0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte  0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte  0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte  0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte  0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte  0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte  0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte  0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte  0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte  0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte  0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte  0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte  0x01,0x00,0x00,0x00,    0x02,0x00,0x00,0x00     # rcon
+.byte  0x04,0x00,0x00,0x00,    0x08,0x00,0x00,0x00
+.byte  0x10,0x00,0x00,0x00,    0x20,0x00,0x00,0x00
+.byte  0x40,0x00,0x00,0x00,    0x80,0x00,0x00,0x00
+.byte  0x1B,0x00,0x00,0x00,    0x36,0x00,0x00,0x00
+
+.align 6
+AES_Td:
+.byte  0x51,0xf4,0xa7,0x50,    0x7e,0x41,0x65,0x53     # Td0
+.byte  0x1a,0x17,0xa4,0xc3,    0x3a,0x27,0x5e,0x96
+.byte  0x3b,0xab,0x6b,0xcb,    0x1f,0x9d,0x45,0xf1
+.byte  0xac,0xfa,0x58,0xab,    0x4b,0xe3,0x03,0x93
+.byte  0x20,0x30,0xfa,0x55,    0xad,0x76,0x6d,0xf6
+.byte  0x88,0xcc,0x76,0x91,    0xf5,0x02,0x4c,0x25
+.byte  0x4f,0xe5,0xd7,0xfc,    0xc5,0x2a,0xcb,0xd7
+.byte  0x26,0x35,0x44,0x80,    0xb5,0x62,0xa3,0x8f
+.byte  0xde,0xb1,0x5a,0x49,    0x25,0xba,0x1b,0x67
+.byte  0x45,0xea,0x0e,0x98,    0x5d,0xfe,0xc0,0xe1
+.byte  0xc3,0x2f,0x75,0x02,    0x81,0x4c,0xf0,0x12
+.byte  0x8d,0x46,0x97,0xa3,    0x6b,0xd3,0xf9,0xc6
+.byte  0x03,0x8f,0x5f,0xe7,    0x15,0x92,0x9c,0x95
+.byte  0xbf,0x6d,0x7a,0xeb,    0x95,0x52,0x59,0xda
+.byte  0xd4,0xbe,0x83,0x2d,    0x58,0x74,0x21,0xd3
+.byte  0x49,0xe0,0x69,0x29,    0x8e,0xc9,0xc8,0x44
+.byte  0x75,0xc2,0x89,0x6a,    0xf4,0x8e,0x79,0x78
+.byte  0x99,0x58,0x3e,0x6b,    0x27,0xb9,0x71,0xdd
+.byte  0xbe,0xe1,0x4f,0xb6,    0xf0,0x88,0xad,0x17
+.byte  0xc9,0x20,0xac,0x66,    0x7d,0xce,0x3a,0xb4
+.byte  0x63,0xdf,0x4a,0x18,    0xe5,0x1a,0x31,0x82
+.byte  0x97,0x51,0x33,0x60,    0x62,0x53,0x7f,0x45
+.byte  0xb1,0x64,0x77,0xe0,    0xbb,0x6b,0xae,0x84
+.byte  0xfe,0x81,0xa0,0x1c,    0xf9,0x08,0x2b,0x94
+.byte  0x70,0x48,0x68,0x58,    0x8f,0x45,0xfd,0x19
+.byte  0x94,0xde,0x6c,0x87,    0x52,0x7b,0xf8,0xb7
+.byte  0xab,0x73,0xd3,0x23,    0x72,0x4b,0x02,0xe2
+.byte  0xe3,0x1f,0x8f,0x57,    0x66,0x55,0xab,0x2a
+.byte  0xb2,0xeb,0x28,0x07,    0x2f,0xb5,0xc2,0x03
+.byte  0x86,0xc5,0x7b,0x9a,    0xd3,0x37,0x08,0xa5
+.byte  0x30,0x28,0x87,0xf2,    0x23,0xbf,0xa5,0xb2
+.byte  0x02,0x03,0x6a,0xba,    0xed,0x16,0x82,0x5c
+.byte  0x8a,0xcf,0x1c,0x2b,    0xa7,0x79,0xb4,0x92
+.byte  0xf3,0x07,0xf2,0xf0,    0x4e,0x69,0xe2,0xa1
+.byte  0x65,0xda,0xf4,0xcd,    0x06,0x05,0xbe,0xd5
+.byte  0xd1,0x34,0x62,0x1f,    0xc4,0xa6,0xfe,0x8a
+.byte  0x34,0x2e,0x53,0x9d,    0xa2,0xf3,0x55,0xa0
+.byte  0x05,0x8a,0xe1,0x32,    0xa4,0xf6,0xeb,0x75
+.byte  0x0b,0x83,0xec,0x39,    0x40,0x60,0xef,0xaa
+.byte  0x5e,0x71,0x9f,0x06,    0xbd,0x6e,0x10,0x51
+.byte  0x3e,0x21,0x8a,0xf9,    0x96,0xdd,0x06,0x3d
+.byte  0xdd,0x3e,0x05,0xae,    0x4d,0xe6,0xbd,0x46
+.byte  0x91,0x54,0x8d,0xb5,    0x71,0xc4,0x5d,0x05
+.byte  0x04,0x06,0xd4,0x6f,    0x60,0x50,0x15,0xff
+.byte  0x19,0x98,0xfb,0x24,    0xd6,0xbd,0xe9,0x97
+.byte  0x89,0x40,0x43,0xcc,    0x67,0xd9,0x9e,0x77
+.byte  0xb0,0xe8,0x42,0xbd,    0x07,0x89,0x8b,0x88
+.byte  0xe7,0x19,0x5b,0x38,    0x79,0xc8,0xee,0xdb
+.byte  0xa1,0x7c,0x0a,0x47,    0x7c,0x42,0x0f,0xe9
+.byte  0xf8,0x84,0x1e,0xc9,    0x00,0x00,0x00,0x00
+.byte  0x09,0x80,0x86,0x83,    0x32,0x2b,0xed,0x48
+.byte  0x1e,0x11,0x70,0xac,    0x6c,0x5a,0x72,0x4e
+.byte  0xfd,0x0e,0xff,0xfb,    0x0f,0x85,0x38,0x56
+.byte  0x3d,0xae,0xd5,0x1e,    0x36,0x2d,0x39,0x27
+.byte  0x0a,0x0f,0xd9,0x64,    0x68,0x5c,0xa6,0x21
+.byte  0x9b,0x5b,0x54,0xd1,    0x24,0x36,0x2e,0x3a
+.byte  0x0c,0x0a,0x67,0xb1,    0x93,0x57,0xe7,0x0f
+.byte  0xb4,0xee,0x96,0xd2,    0x1b,0x9b,0x91,0x9e
+.byte  0x80,0xc0,0xc5,0x4f,    0x61,0xdc,0x20,0xa2
+.byte  0x5a,0x77,0x4b,0x69,    0x1c,0x12,0x1a,0x16
+.byte  0xe2,0x93,0xba,0x0a,    0xc0,0xa0,0x2a,0xe5
+.byte  0x3c,0x22,0xe0,0x43,    0x12,0x1b,0x17,0x1d
+.byte  0x0e,0x09,0x0d,0x0b,    0xf2,0x8b,0xc7,0xad
+.byte  0x2d,0xb6,0xa8,0xb9,    0x14,0x1e,0xa9,0xc8
+.byte  0x57,0xf1,0x19,0x85,    0xaf,0x75,0x07,0x4c
+.byte  0xee,0x99,0xdd,0xbb,    0xa3,0x7f,0x60,0xfd
+.byte  0xf7,0x01,0x26,0x9f,    0x5c,0x72,0xf5,0xbc
+.byte  0x44,0x66,0x3b,0xc5,    0x5b,0xfb,0x7e,0x34
+.byte  0x8b,0x43,0x29,0x76,    0xcb,0x23,0xc6,0xdc
+.byte  0xb6,0xed,0xfc,0x68,    0xb8,0xe4,0xf1,0x63
+.byte  0xd7,0x31,0xdc,0xca,    0x42,0x63,0x85,0x10
+.byte  0x13,0x97,0x22,0x40,    0x84,0xc6,0x11,0x20
+.byte  0x85,0x4a,0x24,0x7d,    0xd2,0xbb,0x3d,0xf8
+.byte  0xae,0xf9,0x32,0x11,    0xc7,0x29,0xa1,0x6d
+.byte  0x1d,0x9e,0x2f,0x4b,    0xdc,0xb2,0x30,0xf3
+.byte  0x0d,0x86,0x52,0xec,    0x77,0xc1,0xe3,0xd0
+.byte  0x2b,0xb3,0x16,0x6c,    0xa9,0x70,0xb9,0x99
+.byte  0x11,0x94,0x48,0xfa,    0x47,0xe9,0x64,0x22
+.byte  0xa8,0xfc,0x8c,0xc4,    0xa0,0xf0,0x3f,0x1a
+.byte  0x56,0x7d,0x2c,0xd8,    0x22,0x33,0x90,0xef
+.byte  0x87,0x49,0x4e,0xc7,    0xd9,0x38,0xd1,0xc1
+.byte  0x8c,0xca,0xa2,0xfe,    0x98,0xd4,0x0b,0x36
+.byte  0xa6,0xf5,0x81,0xcf,    0xa5,0x7a,0xde,0x28
+.byte  0xda,0xb7,0x8e,0x26,    0x3f,0xad,0xbf,0xa4
+.byte  0x2c,0x3a,0x9d,0xe4,    0x50,0x78,0x92,0x0d
+.byte  0x6a,0x5f,0xcc,0x9b,    0x54,0x7e,0x46,0x62
+.byte  0xf6,0x8d,0x13,0xc2,    0x90,0xd8,0xb8,0xe8
+.byte  0x2e,0x39,0xf7,0x5e,    0x82,0xc3,0xaf,0xf5
+.byte  0x9f,0x5d,0x80,0xbe,    0x69,0xd0,0x93,0x7c
+.byte  0x6f,0xd5,0x2d,0xa9,    0xcf,0x25,0x12,0xb3
+.byte  0xc8,0xac,0x99,0x3b,    0x10,0x18,0x7d,0xa7
+.byte  0xe8,0x9c,0x63,0x6e,    0xdb,0x3b,0xbb,0x7b
+.byte  0xcd,0x26,0x78,0x09,    0x6e,0x59,0x18,0xf4
+.byte  0xec,0x9a,0xb7,0x01,    0x83,0x4f,0x9a,0xa8
+.byte  0xe6,0x95,0x6e,0x65,    0xaa,0xff,0xe6,0x7e
+.byte  0x21,0xbc,0xcf,0x08,    0xef,0x15,0xe8,0xe6
+.byte  0xba,0xe7,0x9b,0xd9,    0x4a,0x6f,0x36,0xce
+.byte  0xea,0x9f,0x09,0xd4,    0x29,0xb0,0x7c,0xd6
+.byte  0x31,0xa4,0xb2,0xaf,    0x2a,0x3f,0x23,0x31
+.byte  0xc6,0xa5,0x94,0x30,    0x35,0xa2,0x66,0xc0
+.byte  0x74,0x4e,0xbc,0x37,    0xfc,0x82,0xca,0xa6
+.byte  0xe0,0x90,0xd0,0xb0,    0x33,0xa7,0xd8,0x15
+.byte  0xf1,0x04,0x98,0x4a,    0x41,0xec,0xda,0xf7
+.byte  0x7f,0xcd,0x50,0x0e,    0x17,0x91,0xf6,0x2f
+.byte  0x76,0x4d,0xd6,0x8d,    0x43,0xef,0xb0,0x4d
+.byte  0xcc,0xaa,0x4d,0x54,    0xe4,0x96,0x04,0xdf
+.byte  0x9e,0xd1,0xb5,0xe3,    0x4c,0x6a,0x88,0x1b
+.byte  0xc1,0x2c,0x1f,0xb8,    0x46,0x65,0x51,0x7f
+.byte  0x9d,0x5e,0xea,0x04,    0x01,0x8c,0x35,0x5d
+.byte  0xfa,0x87,0x74,0x73,    0xfb,0x0b,0x41,0x2e
+.byte  0xb3,0x67,0x1d,0x5a,    0x92,0xdb,0xd2,0x52
+.byte  0xe9,0x10,0x56,0x33,    0x6d,0xd6,0x47,0x13
+.byte  0x9a,0xd7,0x61,0x8c,    0x37,0xa1,0x0c,0x7a
+.byte  0x59,0xf8,0x14,0x8e,    0xeb,0x13,0x3c,0x89
+.byte  0xce,0xa9,0x27,0xee,    0xb7,0x61,0xc9,0x35
+.byte  0xe1,0x1c,0xe5,0xed,    0x7a,0x47,0xb1,0x3c
+.byte  0x9c,0xd2,0xdf,0x59,    0x55,0xf2,0x73,0x3f
+.byte  0x18,0x14,0xce,0x79,    0x73,0xc7,0x37,0xbf
+.byte  0x53,0xf7,0xcd,0xea,    0x5f,0xfd,0xaa,0x5b
+.byte  0xdf,0x3d,0x6f,0x14,    0x78,0x44,0xdb,0x86
+.byte  0xca,0xaf,0xf3,0x81,    0xb9,0x68,0xc4,0x3e
+.byte  0x38,0x24,0x34,0x2c,    0xc2,0xa3,0x40,0x5f
+.byte  0x16,0x1d,0xc3,0x72,    0xbc,0xe2,0x25,0x0c
+.byte  0x28,0x3c,0x49,0x8b,    0xff,0x0d,0x95,0x41
+.byte  0x39,0xa8,0x01,0x71,    0x08,0x0c,0xb3,0xde
+.byte  0xd8,0xb4,0xe4,0x9c,    0x64,0x56,0xc1,0x90
+.byte  0x7b,0xcb,0x84,0x61,    0xd5,0x32,0xb6,0x70
+.byte  0x48,0x6c,0x5c,0x74,    0xd0,0xb8,0x57,0x42
+
+.byte  0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38  # Td4
+.byte  0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte  0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte  0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte  0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte  0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte  0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte  0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte  0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte  0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte  0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte  0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte  0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte  0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte  0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte  0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte  0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte  0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte  0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte  0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte  0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte  0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte  0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte  0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte  0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte  0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte  0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte  0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte  0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte  0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte  0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte  0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+___
+\f
+# Post-process the generated text line by line and print it.
+foreach (split("\n",$code)) {
+       # evaluate back-quoted compile-time expressions
+       s/\`([^\`]*)\`/eval $1/ge;
+
+       # Made-up instructions (_xtr, _ins, _ror and _bias) are written
+       # with big-endian shift amounts; rewrite them into real shifts
+       # whose amount depends on the target byte order.
+       if (/^\s+_/) {
+           # two-operand shorthand "_op reg,n" becomes "_op reg,reg,n"
+           s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
+
+           # _xtr -> srl and _ins -> sll, by n on big-endian or by 24-n
+           # on little-endian; _ror -> srl by n or by -n respectively;
+           # _bias -> sll by n or by (n-16)&31 respectively
+           s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
+               sprintf("srl\t$1,$2,%d",$big_endian ?   eval($3)
+                                       :               eval("24-$3"))/e or
+           s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+               sprintf("sll\t$1,$2,%d",$big_endian ?   eval($3)
+                                       :               eval("24-$3"))/e or
+           s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
+               sprintf("srl\t$1,$2,%d",$big_endian ?   eval($3)
+                                       :               eval("$3*-1"))/e or
+           s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+               sprintf("sll\t$1,$2,%d",$big_endian ?   eval($3)
+                                       :               eval("($3-16)&31"))/e;
+
+           # tidy up the result: srl by a negative amount is turned
+           # into sll by the positive one, srl by 0 becomes a mask
+           # with 0xff, and sll by 0 is commented out
+           s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
+               sprintf("sll\t$1,$2,$3")/e                              or
+           s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
+               sprintf("and\t$1,$2,0xff")/e                            or
+           s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
+       }
+
+       # convert lwl/lwr and swr/swl to little-endian order by
+       # adjusting the byte offset within its 32-bit word
+       if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
+           s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
+               sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e        or
+           s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
+               sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
+       }
+
+       print $_,"\n";
+}
+
+# Buffered write errors only surface at close time; fail loudly rather
+# than leave a silently truncated assembly file behind.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/bn/asm/mips-mont.pl b/crypto/bn/asm/mips-mont.pl
new file mode 100644 (file)
index 0000000..b944a12
--- /dev/null
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
+# code.
+
+######################################################################
+# There are a number of MIPS ABIs in use, of which O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+# Symbolic name -> "$<number>" operand, following the NUBI numbering.
+($zero,$at,$t0,$t1,$t2) = map(sprintf('$%d',$_), 0..2, 24, 25);
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7) = map(sprintf('$%d',$_), 4..11);
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11) = map(sprintf('$%d',$_), 12..23);
+($gp,$tp,$sp,$fp,$ra) = map(sprintf('$%d',$_), 3, 28..31);
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+# Pointer arithmetic and register spill/reload mnemonics, plus the
+# register size in bytes, are chosen by ABI flavour.
+if ($flavour !~ /64|n32/i) {
+       ($PTR_ADD,$PTR_SUB)=("add","sub");
+       ($REG_S,$REG_L)=("sw","lw");
+       $SZREG=4;
+} else {
+       # the d-forms incidentally work even on n32
+       ($PTR_ADD,$PTR_SUB)=("dadd","dsub");
+       ($REG_S,$REG_L)=("sd","ld");
+       $SZREG=8;
+}
+# NUBI flavours list four more registers (bits 12-15) in the .mask set
+$SAVED_REGS_MASK = 0x00ff0000;
+$SAVED_REGS_MASK = 0x00fff000 if ($flavour =~ /nubi/i);
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# Skip leading arguments until one looks like a plain file name
+# (word characters and dashes, a dot, an extension) and redirect the
+# generated assembly there. When no argument is left, keep writing to
+# the inherited STDOUT instead of attempting an open that cannot succeed.
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">",$output or die "can't open $output: $!" if (defined $output);
+
+# BN_ULONG load/store/arithmetic mnemonics and the BN_ULONG size in
+# bytes (BNSZ), selected by ABI flavour.
+($LD,$ST,$MULTU,$ADDU,$SUBU,$BNSZ) = ($flavour =~ /64|n32/i)
+       ? ("ld","sd","dmultu","daddu","dsubu",8)
+       : ("lw","sw","multu","addu","subu",4);
+
+# Argument registers, in the order of the C prototype:
+# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+#                 const BN_ULONG *np, const BN_ULONG *n0, int num);
+($rp,$ap,$bp,$np,$n0,$num)=($a0,$a1,$a2,$a3,$a4,$a5);
+
+# Low/high halves of the running ap[]*bp[i] and np[]*m1 sums; these
+# live in registers the prologue does not spill.
+($lo0,$hi0)=($a6,$a7);
+($lo1,$hi1)=($t1,$t2);
+
+# Working set held in the s-registers spilled by the prologue. Note
+# that the Perl variable $tp is re-bound here from the thread pointer
+# to $s3; the assembly uses it as a cursor into the temporary vector.
+($aj,$bi,$nj,$tp)=($s0,$s1,$s2,$s3);
+($alo,$ahi)=($s4,$s5);
+($nlo,$nhi)=($s6,$s7);
+($tj,$i,$j,$m1)=($s8,$s9,$s10,$s11);
+
+# Frame size, counted in register slots (multiplied by SZREG on use).
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set   noat
+.set   noreorder
+
+.align 5
+.globl bn_mul_mont
+.ent   bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 only: fetch the 5th and 6th arguments (n0, num) from the stack
+       lw      $n0,16($sp)
+       lw      $num,20($sp)
+___
+$code.=<<___;
+       slt     $at,$num,4
+       bnez    $at,1f
+       li      $t0,0
+       slt     $at,$num,17     # on in-order CPU
+       bnezl   $at,bn_mul_mont_internal
+       nop
+1:     jr      $ra
+       li      $a0,0
+.end   bn_mul_mont
+
+.align 5
+.ent   bn_mul_mont_internal
+bn_mul_mont_internal:
+       .frame  $fp,$FRAMESIZE*$SZREG,$ra
+       .mask   0x40000000|$SAVED_REGS_MASK,-$SZREG
+       $PTR_SUB $sp,$FRAMESIZE*$SZREG
+       $REG_S  $fp,($FRAMESIZE-1)*$SZREG($sp)
+       $REG_S  $s11,($FRAMESIZE-2)*$SZREG($sp)
+       $REG_S  $s10,($FRAMESIZE-3)*$SZREG($sp)
+       $REG_S  $s9,($FRAMESIZE-4)*$SZREG($sp)
+       $REG_S  $s8,($FRAMESIZE-5)*$SZREG($sp)
+       $REG_S  $s7,($FRAMESIZE-6)*$SZREG($sp)
+       $REG_S  $s6,($FRAMESIZE-7)*$SZREG($sp)
+       $REG_S  $s5,($FRAMESIZE-8)*$SZREG($sp)
+       $REG_S  $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # s0-s3 are spilled only for NUBI flavours
+       $REG_S  $s3,($FRAMESIZE-10)*$SZREG($sp)
+       $REG_S  $s2,($FRAMESIZE-11)*$SZREG($sp)
+       $REG_S  $s1,($FRAMESIZE-12)*$SZREG($sp)
+       $REG_S  $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+       move    $fp,$sp
+
+       .set    reorder
+       $LD     $n0,0($n0)
+       $LD     $bi,0($bp)      # bp[0]
+       $LD     $aj,0($ap)      # ap[0]
+       $LD     $nj,0($np)      # np[0]
+
+       $PTR_SUB $sp,2*$BNSZ    # place for two extra words
+       sll     $num,`log($BNSZ)/log(2)`
+       li      $at,-4096
+       $PTR_SUB $sp,$num
+       and     $sp,$at
+
+       $MULTU  $aj,$bi
+       $LD     $alo,$BNSZ($ap)
+       $LD     $nlo,$BNSZ($np)
+       mflo    $lo0
+       mfhi    $hi0
+       $MULTU  $lo0,$n0
+       mflo    $m1
+
+       $MULTU  $alo,$bi
+       mflo    $alo
+       mfhi    $ahi
+
+       $MULTU  $nj,$m1
+       mflo    $lo1
+       mfhi    $hi1
+       $MULTU  $nlo,$m1
+       $ADDU   $lo1,$lo0
+       sltu    $at,$lo1,$lo0
+       $ADDU   $hi1,$at
+       mflo    $nlo
+       mfhi    $nhi
+
+       move    $tp,$sp
+       li      $j,2*$BNSZ
+.align 4
+.L1st:
+       .set    noreorder
+       $PTR_ADD $aj,$ap,$j
+       $PTR_ADD $nj,$np,$j
+       $LD     $aj,($aj)
+       $LD     $nj,($nj)
+
+       $MULTU  $aj,$bi
+       $ADDU   $lo0,$alo,$hi0
+       $ADDU   $lo1,$nlo,$hi1
+       sltu    $at,$lo0,$hi0
+       sltu    $t0,$lo1,$hi1
+       $ADDU   $hi0,$ahi,$at
+       $ADDU   $hi1,$nhi,$t0
+       mflo    $alo
+       mfhi    $ahi
+
+       $ADDU   $lo1,$lo0
+       sltu    $at,$lo1,$lo0
+       $MULTU  $nj,$m1
+       $ADDU   $hi1,$at
+       addu    $j,$BNSZ
+       $ST     $lo1,($tp)
+       sltu    $t0,$j,$num
+       mflo    $nlo
+       mfhi    $nhi
+
+       bnez    $t0,.L1st
+       $PTR_ADD $tp,$BNSZ
+       .set    reorder
+
+       $ADDU   $lo0,$alo,$hi0
+       sltu    $at,$lo0,$hi0
+       $ADDU   $hi0,$ahi,$at
+
+       $ADDU   $lo1,$nlo,$hi1
+       sltu    $t0,$lo1,$hi1
+       $ADDU   $hi1,$nhi,$t0
+       $ADDU   $lo1,$lo0
+       sltu    $at,$lo1,$lo0
+       $ADDU   $hi1,$at
+
+       $ST     $lo1,($tp)
+
+       $ADDU   $hi1,$hi0
+       sltu    $at,$hi1,$hi0
+       $ST     $hi1,$BNSZ($tp)
+       $ST     $at,2*$BNSZ($tp)
+
+       li      $i,$BNSZ
+.align 4
+.Louter:
+       $PTR_ADD $bi,$bp,$i
+       $LD     $bi,($bi)
+       $LD     $aj,($ap)
+       $LD     $alo,$BNSZ($ap)
+       $LD     $tj,($sp)
+
+       $MULTU  $aj,$bi
+       $LD     $nj,($np)
+       $LD     $nlo,$BNSZ($np)
+       mflo    $lo0
+       mfhi    $hi0
+       $ADDU   $lo0,$tj
+       $MULTU  $lo0,$n0
+       sltu    $at,$lo0,$tj
+       $ADDU   $hi0,$at
+       mflo    $m1
+
+       $MULTU  $alo,$bi
+       mflo    $alo
+       mfhi    $ahi
+
+       $MULTU  $nj,$m1
+       mflo    $lo1
+       mfhi    $hi1
+
+       $MULTU  $nlo,$m1
+       $ADDU   $lo1,$lo0
+       sltu    $at,$lo1,$lo0
+       $ADDU   $hi1,$at
+       mflo    $nlo
+       mfhi    $nhi
+
+       move    $tp,$sp
+       li      $j,2*$BNSZ
+       $LD     $tj,$BNSZ($tp)
+.align 4
+.Linner:
+       .set    noreorder
+       $PTR_ADD $aj,$ap,$j
+       $PTR_ADD $nj,$np,$j
+       $LD     $aj,($aj)
+       $LD     $nj,($nj)
+
+       $MULTU  $aj,$bi
+       $ADDU   $lo0,$alo,$hi0
+       $ADDU   $lo1,$nlo,$hi1
+       sltu    $at,$lo0,$hi0
+       sltu    $t0,$lo1,$hi1
+       $ADDU   $hi0,$ahi,$at
+       $ADDU   $hi1,$nhi,$t0
+       mflo    $alo
+       mfhi    $ahi
+
+       $ADDU   $lo0,$tj
+       addu    $j,$BNSZ
+       $MULTU  $nj,$m1
+       sltu    $at,$lo0,$tj
+       $ADDU   $lo1,$lo0
+       $ADDU   $hi0,$at
+       sltu    $t0,$lo1,$lo0
+       $LD     $tj,2*$BNSZ($tp)
+       $ADDU   $hi1,$t0
+       sltu    $at,$j,$num
+       mflo    $nlo
+       mfhi    $nhi
+       $ST     $lo1,($tp)
+       bnez    $at,.Linner
+       $PTR_ADD $tp,$BNSZ
+       .set    reorder
+
+       $ADDU   $lo0,$alo,$hi0
+       sltu    $at,$lo0,$hi0
+       $ADDU   $hi0,$ahi,$at
+       $ADDU   $lo0,$tj
+       sltu    $t0,$lo0,$tj
+       $ADDU   $hi0,$t0
+
+       $LD     $tj,2*$BNSZ($tp)
+       $ADDU   $lo1,$nlo,$hi1
+       sltu    $at,$lo1,$hi1
+       $ADDU   $hi1,$nhi,$at
+       $ADDU   $lo1,$lo0
+       sltu    $t0,$lo1,$lo0
+       $ADDU   $hi1,$t0
+       $ST     $lo1,($tp)
+
+       $ADDU   $lo1,$hi1,$hi0
+       sltu    $hi1,$lo1,$hi0
+       $ADDU   $lo1,$tj
+       sltu    $at,$lo1,$tj
+       $ADDU   $hi1,$at
+       $ST     $lo1,$BNSZ($tp)
+       $ST     $hi1,2*$BNSZ($tp)
+
+       addu    $i,$BNSZ
+       sltu    $t0,$i,$num
+       bnez    $t0,.Louter
+\f
+       .set    noreorder
+       $PTR_ADD $tj,$sp,$num   # &tp[num]
+       move    $tp,$sp
+       move    $ap,$sp
+       li      $hi0,0          # clear borrow bit
+
+.align 4
+.Lsub: $LD     $lo0,($tp)
+       $LD     $lo1,($np)
+       $PTR_ADD $tp,$BNSZ
+       $PTR_ADD $np,$BNSZ
+       $SUBU   $lo1,$lo0,$lo1  # tp[i]-np[i]
+       sgtu    $at,$lo1,$lo0
+       $SUBU   $lo0,$lo1,$hi0
+       sgtu    $hi0,$lo0,$lo1
+       $ST     $lo0,($rp)
+       or      $hi0,$at
+       sltu    $at,$tp,$tj
+       bnez    $at,.Lsub
+       $PTR_ADD $rp,$BNSZ
+
+       $SUBU   $hi0,$hi1,$hi0  # handle upmost overflow bit
+       move    $tp,$sp
+       $PTR_SUB $rp,$num       # restore rp
+       not     $hi1,$hi0
+
+       and     $ap,$hi0,$sp
+       and     $bp,$hi1,$rp
+       or      $ap,$ap,$bp     # ap=borrow?tp:rp
+
+.align 4
+.Lcopy:        $LD     $aj,($ap)
+       $PTR_ADD $ap,$BNSZ
+       $ST     $zero,($tp)
+       $PTR_ADD $tp,$BNSZ
+       sltu    $at,$tp,$tj
+       $ST     $aj,($rp)
+       bnez    $at,.Lcopy
+       $PTR_ADD $rp,$BNSZ
+
+       li      $a0,1
+       li      $t0,1
+
+       .set    noreorder
+       move    $sp,$fp
+       $REG_L  $fp,($FRAMESIZE-1)*$SZREG($sp)
+       $REG_L  $s11,($FRAMESIZE-2)*$SZREG($sp)
+       $REG_L  $s10,($FRAMESIZE-3)*$SZREG($sp)
+       $REG_L  $s9,($FRAMESIZE-4)*$SZREG($sp)
+       $REG_L  $s8,($FRAMESIZE-5)*$SZREG($sp)
+       $REG_L  $s7,($FRAMESIZE-6)*$SZREG($sp)
+       $REG_L  $s6,($FRAMESIZE-7)*$SZREG($sp)
+       $REG_L  $s5,($FRAMESIZE-8)*$SZREG($sp)
+       $REG_L  $s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # reload s0-s3 from the slots of the NUBI-only spill in the prologue
+       $REG_L  $s3,($FRAMESIZE-10)*$SZREG($sp)
+       $REG_L  $s2,($FRAMESIZE-11)*$SZREG($sp)
+       $REG_L  $s1,($FRAMESIZE-12)*$SZREG($sp)
+       $REG_L  $s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end   bn_mul_mont_internal
+.rdata
+.asciiz        "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Evaluate back-quoted compile-time expressions (such as the shift
+# count derived from BNSZ) and emit the finished module.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+# Buffered write errors only surface at close time; fail loudly rather
+# than leave a silently truncated assembly file behind.
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/bn/asm/mips.pl b/crypto/bn/asm/mips.pl
new file mode 100644 (file)
index 0000000..acfd359
--- /dev/null
@@ -0,0 +1,2585 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABI(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
+# IRIX 5.x not only because it doesn't support new ABIs but also
+# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+# cause illegal instruction exception:-(
+#
+# In addition the code depends on preprocessor flags set up by MIPSpro
+# compiler driver (either as or cc) and therefore (probably?) can't be
+# compiled by the GNU assembler. GNU C driver manages fine though...
+# I mean as long as -mmips-as is specified or is the default option,
+# because then it simply invokes /usr/bin/as which in turn takes
+# perfect care of the preprocessor definitions. Another neat feature
+# offered by the MIPSpro assembler is an optimization pass. This gave
+# me the opportunity to have the code looking more regular as all those
+# architecture dependent instruction rescheduling details were left to
+# the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+#                                      <appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanical replacement of 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction 32-bit compiler can't use], one
+# has to be content with 40-85% improvement depending on benchmark and
+# key length, more for longer keys.
+
+# The first command-line argument selects the ABI flavour. The remaining
+# arguments are consumed until one looks like a file name (a word
+# character, then word characters or dashes, a dot and an extension);
+# that one becomes the output file.
+$flavour = shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+# Redirect STDOUT only when an output file was actually given; otherwise
+# the generated assembly keeps going to the inherited STDOUT. If the file
+# cannot be created, stop instead of carrying on without it.
+if ($output) {
+       open(STDOUT, '>', $output) or die "can't open $output: $!";
+}
+
+# Pick instruction mnemonics and sizes for the requested flavour. A flavour
+# string containing "64" or "n32" (case-insensitive) gets the doubleword
+# variants; anything else gets the 32-bit word variants.
+#   $LD/$ST, $MULTU/$DIVU, $ADDU/$SUBU, $SRL/$SLL - operations on bignum words
+#   $BNSZ             - size in bytes of one bignum word (scales array offsets)
+#   $PTR_ADD/$PTR_SUB - pointer arithmetic
+#   $SZREG            - size in bytes of one stack slot for a saved register
+#   $REG_S/$REG_L     - store/load of a saved register
+if ($flavour =~ /64|n32/i) {
+       $LD="ld";
+       $ST="sd";
+       $MULTU="dmultu";
+       $DIVU="ddivu";
+       $ADDU="daddu";
+       $SUBU="dsubu";
+       $SRL="dsrl";
+       $SLL="dsll";
+       $BNSZ=8;
+       $PTR_ADD="daddu";
+       $PTR_SUB="dsubu";
+       $SZREG=8;
+       $REG_S="sd";
+       $REG_L="ld";
+} else {
+       $LD="lw";
+       $ST="sw";
+       $MULTU="multu";
+       $DIVU="divu";
+       $ADDU="addu";
+       $SUBU="subu";
+       $SRL="srl";
+       $SLL="sll";
+       $BNSZ=4;
+       $PTR_ADD="addu";
+       $PTR_SUB="subu";
+       $SZREG=4;
+       $REG_S="sw";
+       $REG_L="lw";
+       # Only the 32-bit output starts with an explicit ISA level directive.
+       $code=".set     mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+# For NUBI, make the name $gp refer to the same register as $v1, so the
+# "$gp" slot in the NUBI prologues/epilogues below actually preserves $v1.
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+# $v1 also serves as $minus4: the routines below load it with -4 and use it
+# to mask the word count down to a multiple of four for the unrolled loops.
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz        "mips3.s, Version 1.2"
+.asciiz        "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set   noat
+
+.align 5
+.globl bn_mul_add_words
+.ent   bn_mul_add_words
+bn_mul_add_words:
+       .set    noreorder
+       bgtz    $a2,bn_mul_add_words_internal   # positive word count: do the real work
+       move    $v0,$zero                       # delay slot: carry word starts at zero
+       jr      $ra                             # count is zero or negative: nothing to do
+       move    $a0,$v0                         # delay slot: copy the result to a0 as well
+.end   bn_mul_add_words
+
+.align 5
+.ent   bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       li      $minus4,-4
+       and     $ta0,$a2,$minus4
+       $LD     $t0,0($a1)
+       beqz    $ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+       $MULTU  $t0,$a3
+       $LD     $t1,0($a0)
+       $LD     $t2,$BNSZ($a1)
+       $LD     $t3,$BNSZ($a0)
+       $LD     $ta0,2*$BNSZ($a1)
+       $LD     $ta1,2*$BNSZ($a0)
+       $ADDU   $t1,$v0
+       sltu    $v0,$t1,$v0     # All manuals say it "compares 32-bit
+                               # values", but it seems to work fine
+                               # even on 64-bit registers.
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $t1,$at
+       $ADDU   $v0,$t0
+        $MULTU $t2,$a3
+       sltu    $at,$t1,$at
+       $ST     $t1,0($a0)
+       $ADDU   $v0,$at
+
+       $LD     $ta2,3*$BNSZ($a1)
+       $LD     $ta3,3*$BNSZ($a0)
+       $ADDU   $t3,$v0
+       sltu    $v0,$t3,$v0
+       mflo    $at
+       mfhi    $t2
+       $ADDU   $t3,$at
+       $ADDU   $v0,$t2
+        $MULTU $ta0,$a3
+       sltu    $at,$t3,$at
+       $ST     $t3,$BNSZ($a0)
+       $ADDU   $v0,$at
+
+       subu    $a2,4
+       $PTR_ADD $a0,4*$BNSZ
+       $PTR_ADD $a1,4*$BNSZ
+       $ADDU   $ta1,$v0
+       sltu    $v0,$ta1,$v0
+       mflo    $at
+       mfhi    $ta0
+       $ADDU   $ta1,$at
+       $ADDU   $v0,$ta0
+        $MULTU $ta2,$a3
+       sltu    $at,$ta1,$at
+       $ST     $ta1,-2*$BNSZ($a0)
+       $ADDU   $v0,$at
+
+
+       and     $ta0,$a2,$minus4
+       $ADDU   $ta3,$v0
+       sltu    $v0,$ta3,$v0
+       mflo    $at
+       mfhi    $ta2
+       $ADDU   $ta3,$at
+       $ADDU   $v0,$ta2
+       sltu    $at,$ta3,$at
+       $ST     $ta3,-$BNSZ($a0)
+       $ADDU   $v0,$at
+       .set    noreorder
+       bgtzl   $ta0,.L_bn_mul_add_words_loop
+       $LD     $t0,0($a1)
+
+       beqz    $a2,.L_bn_mul_add_words_return
+       nop
+
+.L_bn_mul_add_words_tail:
+       .set    reorder
+       $LD     $t0,0($a1)
+       $MULTU  $t0,$a3
+       $LD     $t1,0($a0)
+       subu    $a2,1
+       $ADDU   $t1,$v0
+       sltu    $v0,$t1,$v0
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $t1,$at
+       $ADDU   $v0,$t0
+       sltu    $at,$t1,$at
+       $ST     $t1,0($a0)
+       $ADDU   $v0,$at
+       beqz    $a2,.L_bn_mul_add_words_return
+
+       $LD     $t0,$BNSZ($a1)
+       $MULTU  $t0,$a3
+       $LD     $t1,$BNSZ($a0)
+       subu    $a2,1
+       $ADDU   $t1,$v0
+       sltu    $v0,$t1,$v0
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $t1,$at
+       $ADDU   $v0,$t0
+       sltu    $at,$t1,$at
+       $ST     $t1,$BNSZ($a0)
+       $ADDU   $v0,$at
+       beqz    $a2,.L_bn_mul_add_words_return
+
+       $LD     $t0,2*$BNSZ($a1)
+       $MULTU  $t0,$a3
+       $LD     $t1,2*$BNSZ($a0)
+       $ADDU   $t1,$v0
+       sltu    $v0,$t1,$v0
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $t1,$at
+       $ADDU   $v0,$t0
+       sltu    $at,$t1,$at
+       $ST     $t1,2*$BNSZ($a0)
+       $ADDU   $v0,$at
+
+.L_bn_mul_add_words_return:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0         # delay slot: copy the carry word to a0 as well
+.end   bn_mul_add_words_internal
+
+.align 5
+.globl bn_mul_words
+.ent   bn_mul_words
+bn_mul_words:
+       .set    noreorder
+       bgtz    $a2,bn_mul_words_internal
+       move    $v0,$zero
+       jr      $ra
+       move    $a0,$v0
+.end   bn_mul_words
+
+.align 5
+.ent   bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       li      $minus4,-4
+       and     $ta0,$a2,$minus4
+       $LD     $t0,0($a1)
+       beqz    $ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+       $MULTU  $t0,$a3
+       $LD     $t2,$BNSZ($a1)
+       $LD     $ta0,2*$BNSZ($a1)
+       $LD     $ta2,3*$BNSZ($a1)
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $v0,$at
+       sltu    $t1,$v0,$at
+        $MULTU $t2,$a3
+       $ST     $v0,0($a0)
+       $ADDU   $v0,$t1,$t0
+
+       subu    $a2,4
+       $PTR_ADD $a0,4*$BNSZ
+       $PTR_ADD $a1,4*$BNSZ
+       mflo    $at
+       mfhi    $t2
+       $ADDU   $v0,$at
+       sltu    $t3,$v0,$at
+        $MULTU $ta0,$a3
+       $ST     $v0,-3*$BNSZ($a0)
+       $ADDU   $v0,$t3,$t2
+
+       mflo    $at
+       mfhi    $ta0
+       $ADDU   $v0,$at
+       sltu    $ta1,$v0,$at
+        $MULTU $ta2,$a3
+       $ST     $v0,-2*$BNSZ($a0)
+       $ADDU   $v0,$ta1,$ta0
+
+       and     $ta0,$a2,$minus4
+       mflo    $at
+       mfhi    $ta2
+       $ADDU   $v0,$at
+       sltu    $ta3,$v0,$at
+       $ST     $v0,-$BNSZ($a0)
+       $ADDU   $v0,$ta3,$ta2
+       .set    noreorder
+       bgtzl   $ta0,.L_bn_mul_words_loop
+       $LD     $t0,0($a1)
+
+       beqz    $a2,.L_bn_mul_words_return
+       nop
+
+.L_bn_mul_words_tail:
+       .set    reorder
+       $LD     $t0,0($a1)
+       $MULTU  $t0,$a3
+       subu    $a2,1
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $v0,$at
+       sltu    $t1,$v0,$at
+       $ST     $v0,0($a0)
+       $ADDU   $v0,$t1,$t0
+       beqz    $a2,.L_bn_mul_words_return
+
+       $LD     $t0,$BNSZ($a1)
+       $MULTU  $t0,$a3
+       subu    $a2,1
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $v0,$at
+       sltu    $t1,$v0,$at
+       $ST     $v0,$BNSZ($a0)
+       $ADDU   $v0,$t1,$t0
+       beqz    $a2,.L_bn_mul_words_return
+
+       $LD     $t0,2*$BNSZ($a1)
+       $MULTU  $t0,$a3
+       mflo    $at
+       mfhi    $t0
+       $ADDU   $v0,$at
+       sltu    $t1,$v0,$at
+       $ST     $v0,2*$BNSZ($a0)
+       $ADDU   $v0,$t1,$t0
+
+.L_bn_mul_words_return:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0
+.end   bn_mul_words_internal
+
+.align 5
+.globl bn_sqr_words
+.ent   bn_sqr_words
+bn_sqr_words:
+       .set    noreorder
+       bgtz    $a2,bn_sqr_words_internal
+       move    $v0,$zero
+       jr      $ra
+       move    $a0,$v0
+.end   bn_sqr_words
+
+.align 5
+.ent   bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       li      $minus4,-4
+       and     $ta0,$a2,$minus4
+       $LD     $t0,0($a1)
+       beqz    $ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+       $MULTU  $t0,$t0
+       $LD     $t2,$BNSZ($a1)
+       $LD     $ta0,2*$BNSZ($a1)
+       $LD     $ta2,3*$BNSZ($a1)
+       mflo    $t1
+       mfhi    $t0
+       $ST     $t1,0($a0)
+       $ST     $t0,$BNSZ($a0)
+
+       $MULTU  $t2,$t2
+       subu    $a2,4
+       $PTR_ADD $a0,8*$BNSZ
+       $PTR_ADD $a1,4*$BNSZ
+       mflo    $t3
+       mfhi    $t2
+       $ST     $t3,-6*$BNSZ($a0)
+       $ST     $t2,-5*$BNSZ($a0)
+
+       $MULTU  $ta0,$ta0
+       mflo    $ta1
+       mfhi    $ta0
+       $ST     $ta1,-4*$BNSZ($a0)
+       $ST     $ta0,-3*$BNSZ($a0)
+
+
+       $MULTU  $ta2,$ta2
+       and     $ta0,$a2,$minus4
+       mflo    $ta3
+       mfhi    $ta2
+       $ST     $ta3,-2*$BNSZ($a0)
+       $ST     $ta2,-$BNSZ($a0)
+
+       .set    noreorder
+       bgtzl   $ta0,.L_bn_sqr_words_loop
+       $LD     $t0,0($a1)
+
+       beqz    $a2,.L_bn_sqr_words_return
+       nop
+
+.L_bn_sqr_words_tail:
+       .set    reorder
+       $LD     $t0,0($a1)
+       $MULTU  $t0,$t0
+       subu    $a2,1
+       mflo    $t1
+       mfhi    $t0
+       $ST     $t1,0($a0)
+       $ST     $t0,$BNSZ($a0)
+       beqz    $a2,.L_bn_sqr_words_return
+
+       $LD     $t0,$BNSZ($a1)
+       $MULTU  $t0,$t0
+       subu    $a2,1
+       mflo    $t1
+       mfhi    $t0
+       $ST     $t1,2*$BNSZ($a0)
+       $ST     $t0,3*$BNSZ($a0)
+       beqz    $a2,.L_bn_sqr_words_return
+
+       $LD     $t0,2*$BNSZ($a1)
+       $MULTU  $t0,$t0
+       mflo    $t1
+       mfhi    $t0
+       $ST     $t1,4*$BNSZ($a0)
+       $ST     $t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0
+
+.end   bn_sqr_words_internal
+
+.align 5
+.globl bn_add_words
+.ent   bn_add_words
+bn_add_words:
+       .set    noreorder
+       bgtz    $a3,bn_add_words_internal
+       move    $v0,$zero
+       jr      $ra
+       move    $a0,$v0
+.end   bn_add_words
+
+.align 5
+.ent   bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       li      $minus4,-4
+       and     $at,$a3,$minus4
+       $LD     $t0,0($a1)
+       beqz    $at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+       $LD     $ta0,0($a2)
+       subu    $a3,4
+       $LD     $t1,$BNSZ($a1)
+       and     $at,$a3,$minus4
+       $LD     $t2,2*$BNSZ($a1)
+       $PTR_ADD $a2,4*$BNSZ
+       $LD     $t3,3*$BNSZ($a1)
+       $PTR_ADD $a0,4*$BNSZ
+       $LD     $ta1,-3*$BNSZ($a2)
+       $PTR_ADD $a1,4*$BNSZ
+       $LD     $ta2,-2*$BNSZ($a2)
+       $LD     $ta3,-$BNSZ($a2)
+       $ADDU   $ta0,$t0
+       sltu    $t8,$ta0,$t0
+       $ADDU   $t0,$ta0,$v0
+       sltu    $v0,$t0,$ta0
+       $ST     $t0,-4*$BNSZ($a0)
+       $ADDU   $v0,$t8
+
+       $ADDU   $ta1,$t1
+       sltu    $t9,$ta1,$t1
+       $ADDU   $t1,$ta1,$v0
+       sltu    $v0,$t1,$ta1
+       $ST     $t1,-3*$BNSZ($a0)
+       $ADDU   $v0,$t9
+
+       $ADDU   $ta2,$t2
+       sltu    $t8,$ta2,$t2
+       $ADDU   $t2,$ta2,$v0
+       sltu    $v0,$t2,$ta2
+       $ST     $t2,-2*$BNSZ($a0)
+       $ADDU   $v0,$t8
+       
+       $ADDU   $ta3,$t3
+       sltu    $t9,$ta3,$t3
+       $ADDU   $t3,$ta3,$v0
+       sltu    $v0,$t3,$ta3
+       $ST     $t3,-$BNSZ($a0)
+       $ADDU   $v0,$t9
+       
+       .set    noreorder
+       bgtzl   $at,.L_bn_add_words_loop
+       $LD     $t0,0($a1)
+
+       beqz    $a3,.L_bn_add_words_return
+       nop
+
+.L_bn_add_words_tail:
+       .set    reorder
+       $LD     $t0,0($a1)
+       $LD     $ta0,0($a2)
+       $ADDU   $ta0,$t0
+       subu    $a3,1
+       sltu    $t8,$ta0,$t0
+       $ADDU   $t0,$ta0,$v0
+       sltu    $v0,$t0,$ta0
+       $ST     $t0,0($a0)
+       $ADDU   $v0,$t8
+       beqz    $a3,.L_bn_add_words_return
+
+       $LD     $t1,$BNSZ($a1)
+       $LD     $ta1,$BNSZ($a2)
+       $ADDU   $ta1,$t1
+       subu    $a3,1
+       sltu    $t9,$ta1,$t1
+       $ADDU   $t1,$ta1,$v0
+       sltu    $v0,$t1,$ta1
+       $ST     $t1,$BNSZ($a0)
+       $ADDU   $v0,$t9
+       beqz    $a3,.L_bn_add_words_return
+
+       $LD     $t2,2*$BNSZ($a1)
+       $LD     $ta2,2*$BNSZ($a2)
+       $ADDU   $ta2,$t2
+       sltu    $t8,$ta2,$t2
+       $ADDU   $t2,$ta2,$v0
+       sltu    $v0,$t2,$ta2
+       $ST     $t2,2*$BNSZ($a0)
+       $ADDU   $v0,$t8
+
+.L_bn_add_words_return:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0
+
+.end   bn_add_words_internal
+
+.align 5
+.globl bn_sub_words
+.ent   bn_sub_words
+bn_sub_words:
+       .set    noreorder
+       bgtz    $a3,bn_sub_words_internal
+       move    $v0,$zero
+       jr      $ra
+       move    $a0,$zero
+.end   bn_sub_words
+
+.align 5
+.ent   bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       li      $minus4,-4
+       and     $at,$a3,$minus4
+       $LD     $t0,0($a1)
+       beqz    $at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+       $LD     $ta0,0($a2)
+       subu    $a3,4
+       $LD     $t1,$BNSZ($a1)
+       and     $at,$a3,$minus4
+       $LD     $t2,2*$BNSZ($a1)
+       $PTR_ADD $a2,4*$BNSZ
+       $LD     $t3,3*$BNSZ($a1)
+       $PTR_ADD $a0,4*$BNSZ
+       $LD     $ta1,-3*$BNSZ($a2)
+       $PTR_ADD $a1,4*$BNSZ
+       $LD     $ta2,-2*$BNSZ($a2)
+       $LD     $ta3,-$BNSZ($a2)
+       sltu    $t8,$t0,$ta0
+       $SUBU   $ta0,$t0,$ta0
+       $SUBU   $t0,$ta0,$v0
+       sgtu    $v0,$t0,$ta0
+       $ST     $t0,-4*$BNSZ($a0)
+       $ADDU   $v0,$t8
+
+       sltu    $t9,$t1,$ta1
+       $SUBU   $ta1,$t1,$ta1
+       $SUBU   $t1,$ta1,$v0
+       sgtu    $v0,$t1,$ta1
+       $ST     $t1,-3*$BNSZ($a0)
+       $ADDU   $v0,$t9
+
+
+       sltu    $t8,$t2,$ta2
+       $SUBU   $ta2,$t2,$ta2
+       $SUBU   $t2,$ta2,$v0
+       sgtu    $v0,$t2,$ta2
+       $ST     $t2,-2*$BNSZ($a0)
+       $ADDU   $v0,$t8
+
+       sltu    $t9,$t3,$ta3
+       $SUBU   $ta3,$t3,$ta3
+       $SUBU   $t3,$ta3,$v0
+       sgtu    $v0,$t3,$ta3
+       $ST     $t3,-$BNSZ($a0)
+       $ADDU   $v0,$t9
+
+       .set    noreorder
+       bgtzl   $at,.L_bn_sub_words_loop
+       $LD     $t0,0($a1)
+
+       beqz    $a3,.L_bn_sub_words_return
+       nop
+
+.L_bn_sub_words_tail:
+       .set    reorder
+       $LD     $t0,0($a1)
+       $LD     $ta0,0($a2)
+       subu    $a3,1
+       sltu    $t8,$t0,$ta0
+       $SUBU   $ta0,$t0,$ta0
+       $SUBU   $t0,$ta0,$v0
+       sgtu    $v0,$t0,$ta0
+       $ST     $t0,0($a0)
+       $ADDU   $v0,$t8
+       beqz    $a3,.L_bn_sub_words_return
+
+       $LD     $t1,$BNSZ($a1)
+       subu    $a3,1
+       $LD     $ta1,$BNSZ($a2)
+       sltu    $t9,$t1,$ta1
+       $SUBU   $ta1,$t1,$ta1
+       $SUBU   $t1,$ta1,$v0
+       sgtu    $v0,$t1,$ta1
+       $ST     $t1,$BNSZ($a0)
+       $ADDU   $v0,$t9
+       beqz    $a3,.L_bn_sub_words_return
+
+       $LD     $t2,2*$BNSZ($a1)
+       $LD     $ta2,2*$BNSZ($a2)
+       sltu    $t8,$t2,$ta2
+       $SUBU   $ta2,$t2,$ta2
+       $SUBU   $t2,$ta2,$v0
+       sgtu    $v0,$t2,$ta2
+       $ST     $t2,2*$BNSZ($a0)
+       $ADDU   $v0,$t8
+
+.L_bn_sub_words_return:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0         # delay slot: copy the borrow to a0 as well
+.end   bn_sub_words_internal
+
+.align 5
+.globl bn_div_3_words
+.ent   bn_div_3_words
+bn_div_3_words:
+       .set    noreorder
+       move    $a3,$a0         # we know that bn_div_words does not
+                               # touch $a3, $ta2, $ta3 and preserves $a2
+                               # so that we can save two arguments
+                               # and return address in registers
+                               # instead of stack:-)
+                               
+       $LD     $a0,($a3)
+       move    $ta2,$a1
+       bne     $a0,$a2,bn_div_3_words_internal
+       $LD     $a1,-$BNSZ($a3)
+       li      $v0,-1
+       jr      $ra
+       move    $a0,$v0
+.end   bn_div_3_words
+
+.align 5
+.ent   bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       move    $ta3,$ra
+       bal     bn_div_words
+       move    $ra,$ta3
+       $MULTU  $ta2,$v0
+       $LD     $t2,-2*$BNSZ($a3)
+       move    $ta0,$zero
+       mfhi    $t1
+       mflo    $t0
+       sltu    $t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+       bnez    $t8,.L_bn_div_3_words_inner_loop_done
+       sgeu    $at,$t2,$t0
+       seq     $t9,$t1,$a1
+       and     $at,$t9
+       sltu    $t3,$t0,$ta2
+       $ADDU   $a1,$a2
+       $SUBU   $t1,$t3
+       $SUBU   $t0,$ta2
+       sltu    $t8,$t1,$a1
+       sltu    $ta0,$a1,$a2
+       or      $t8,$ta0
+       .set    noreorder
+       beqzl   $at,.L_bn_div_3_words_inner_loop
+       $SUBU   $v0,1
+       .set    reorder
+.L_bn_div_3_words_inner_loop_done:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0
+.end   bn_div_3_words_internal
+
+.align 5
+.globl bn_div_words
+.ent   bn_div_words
+bn_div_words:
+       .set    noreorder
+       bnez    $a2,bn_div_words_internal
+       li      $v0,-1          # I would rather signal div-by-zero
+                               # which can be done with 'break 7'
+       jr      $ra
+       move    $a0,$v0
+.end   bn_div_words
+
+.align 5
+.ent   bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       move    $v1,$zero
+       bltz    $a2,.L_bn_div_words_body
+       move    $t9,$v1
+       $SLL    $a2,1
+       bgtz    $a2,.-4
+       addu    $t9,1
+
+       .set    reorder
+       negu    $t1,$t9
+       li      $t2,-1
+       $SLL    $t2,$t1
+       and     $t2,$a0
+       $SRL    $at,$a1,$t1
+       .set    noreorder
+       bnezl   $t2,.+8
+       break   6               # signal overflow
+       .set    reorder
+       $SLL    $a0,$t9
+       $SLL    $a1,$t9
+       or      $a0,$at
+___
+$QT=$ta0;
+$HH=$ta1;
+$DH=$v1;
+$code.=<<___;
+.L_bn_div_words_body:
+       $SRL    $DH,$a2,4*$BNSZ # bits
+       sgeu    $at,$a0,$a2
+       .set    noreorder
+       bnezl   $at,.+8
+       $SUBU   $a0,$a2
+       .set    reorder
+
+       li      $QT,-1
+       $SRL    $HH,$a0,4*$BNSZ # bits
+       $SRL    $QT,4*$BNSZ     # q=0xffffffff
+       beq     $DH,$HH,.L_bn_div_words_skip_div1
+       $DIVU   $zero,$a0,$DH
+       mflo    $QT
+.L_bn_div_words_skip_div1:
+       $MULTU  $a2,$QT
+       $SLL    $t3,$a0,4*$BNSZ # bits
+       $SRL    $at,$a1,4*$BNSZ # bits
+       or      $t3,$at
+       mflo    $t0
+       mfhi    $t1
+.L_bn_div_words_inner_loop1:
+       sltu    $t2,$t3,$t0
+       seq     $t8,$HH,$t1
+       sltu    $at,$HH,$t1
+       and     $t2,$t8
+       sltu    $v0,$t0,$a2
+       or      $at,$t2
+       .set    noreorder
+       beqz    $at,.L_bn_div_words_inner_loop1_done
+       $SUBU   $t1,$v0
+       $SUBU   $t0,$a2
+       b       .L_bn_div_words_inner_loop1
+       $SUBU   $QT,1
+       .set    reorder
+.L_bn_div_words_inner_loop1_done:
+
+       $SLL    $a1,4*$BNSZ     # bits
+       $SUBU   $a0,$t3,$t0
+       $SLL    $v0,$QT,4*$BNSZ # bits
+
+       li      $QT,-1
+       $SRL    $HH,$a0,4*$BNSZ # bits
+       $SRL    $QT,4*$BNSZ     # q=0xffffffff
+       beq     $DH,$HH,.L_bn_div_words_skip_div2
+       $DIVU   $zero,$a0,$DH
+       mflo    $QT
+.L_bn_div_words_skip_div2:
+       $MULTU  $a2,$QT
+       $SLL    $t3,$a0,4*$BNSZ # bits
+       $SRL    $at,$a1,4*$BNSZ # bits
+       or      $t3,$at
+       mflo    $t0
+       mfhi    $t1
+.L_bn_div_words_inner_loop2:
+       sltu    $t2,$t3,$t0
+       seq     $t8,$HH,$t1
+       sltu    $at,$HH,$t1
+       and     $t2,$t8
+       sltu    $v1,$t0,$a2
+       or      $at,$t2
+       .set    noreorder
+       beqz    $at,.L_bn_div_words_inner_loop2_done
+       $SUBU   $t1,$v1
+       $SUBU   $t0,$a2
+       b       .L_bn_div_words_inner_loop2
+       $SUBU   $QT,1
+       .set    reorder
+.L_bn_div_words_inner_loop2_done:
+
+       $SUBU   $a0,$t3,$t0
+       or      $v0,$QT
+       $SRL    $v1,$a0,$t9     # $v1 contains remainder if anybody wants it
+       $SRL    $a2,$t9         # restore $a2
+
+       .set    noreorder
+       move    $a1,$v1
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       move    $a0,$v0
+.end   bn_div_words_internal
+___
+undef $HH; undef $QT; undef $DH;
+
+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
+
+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
+
+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
+
+$code.=<<___;
+
+.align 5
+.globl bn_mul_comba8
+.ent   bn_mul_comba8
+bn_mul_comba8:
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,12*$SZREG,$ra
+       .mask   0x803ff008,-$SZREG
+       $PTR_SUB $sp,12*$SZREG
+       $REG_S  $ra,11*$SZREG($sp)
+       $REG_S  $s5,10*$SZREG($sp)
+       $REG_S  $s4,9*$SZREG($sp)
+       $REG_S  $s3,8*$SZREG($sp)
+       $REG_S  $s2,7*$SZREG($sp)
+       $REG_S  $s1,6*$SZREG($sp)
+       $REG_S  $s0,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x003f0000,-$SZREG
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $s5,5*$SZREG($sp)
+       $REG_S  $s4,4*$SZREG($sp)
+       $REG_S  $s3,3*$SZREG($sp)
+       $REG_S  $s2,2*$SZREG($sp)
+       $REG_S  $s1,1*$SZREG($sp)
+       $REG_S  $s0,0*$SZREG($sp)
+___
+$code.=<<___;
+
+       .set    reorder
+       $LD     $a_0,0($a1)     # If compiled with -mips3 option on
+                               # R5000 box assembler barks on this
+                               # 1ine with "should not have mult/div
+                               # as last instruction in bb (R10K
+                               # bug)" warning. If anybody out there
+                               # has a clue about how to circumvent
+                               # this do send me a note.
+                               #               <appro\@fy.chalmers.se>
+
+       $LD     $b_0,0($a2)
+       $LD     $a_1,$BNSZ($a1)
+       $LD     $a_2,2*$BNSZ($a1)
+       $MULTU  $a_0,$b_0               # mul_add_c(a[0],b[0],c1,c2,c3);
+       $LD     $a_3,3*$BNSZ($a1)
+       $LD     $b_1,$BNSZ($a2)
+       $LD     $b_2,2*$BNSZ($a2)
+       $LD     $b_3,3*$BNSZ($a2)
+       mflo    $c_1
+       mfhi    $c_2
+
+       $LD     $a_4,4*$BNSZ($a1)
+       $LD     $a_5,5*$BNSZ($a1)
+       $MULTU  $a_0,$b_1               # mul_add_c(a[0],b[1],c2,c3,c1);
+       $LD     $a_6,6*$BNSZ($a1)
+       $LD     $a_7,7*$BNSZ($a1)
+       $LD     $b_4,4*$BNSZ($a2)
+       $LD     $b_5,5*$BNSZ($a2)
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_1,$b_0               # mul_add_c(a[1],b[0],c2,c3,c1);
+       $ADDU   $c_3,$t_2,$at
+       $LD     $b_6,6*$BNSZ($a2)
+       $LD     $b_7,7*$BNSZ($a2)
+       $ST     $c_1,0($a0)     # r[0]=c1;
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_2,$b_0               # mul_add_c(a[2],b[0],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       $ST     $c_2,$BNSZ($a0) # r[1]=c2;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_1,$b_1               # mul_add_c(a[1],b[1],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_0,$b_2               # mul_add_c(a[0],b[2],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_0,$b_3               # mul_add_c(a[0],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,2*$BNSZ($a0)       # r[2]=c3;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_1,$b_2               # mul_add_c(a[1],b[2],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $c_3,$c_2,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_2,$b_1               # mul_add_c(a[2],b[1],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_3,$b_0               # mul_add_c(a[3],b[0],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_4,$b_0               # mul_add_c(a[4],b[0],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,3*$BNSZ($a0)       # r[3]=c1;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_3,$b_1               # mul_add_c(a[3],b[1],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_2,$b_2               # mul_add_c(a[2],b[2],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_1,$b_3               # mul_add_c(a[1],b[3],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_0,$b_4               # mul_add_c(a[0],b[4],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_0,$b_5               # mul_add_c(a[0],b[5],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,4*$BNSZ($a0)       # r[4]=c2;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_1,$b_4               # mul_add_c(a[1],b[4],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_2,$b_3               # mul_add_c(a[2],b[3],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_3,$b_2               # mul_add_c(a[3],b[2],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_4,$b_1               # mul_add_c(a[4],b[1],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_5,$b_0               # mul_add_c(a[5],b[0],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_6,$b_0               # mul_add_c(a[6],b[0],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,5*$BNSZ($a0)       # r[5]=c3;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_5,$b_1               # mul_add_c(a[5],b[1],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $c_3,$c_2,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_4,$b_2               # mul_add_c(a[4],b[2],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_3,$b_3               # mul_add_c(a[3],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_2,$b_4               # mul_add_c(a[2],b[4],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_1,$b_5               # mul_add_c(a[1],b[5],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_0,$b_6               # mul_add_c(a[0],b[6],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_0,$b_7               # mul_add_c(a[0],b[7],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,6*$BNSZ($a0)       # r[6]=c1;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_1,$b_6               # mul_add_c(a[1],b[6],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_2,$b_5               # mul_add_c(a[2],b[5],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_3,$b_4               # mul_add_c(a[3],b[4],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_4,$b_3               # mul_add_c(a[4],b[3],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_5,$b_2               # mul_add_c(a[5],b[2],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_6,$b_1               # mul_add_c(a[6],b[1],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_7,$b_0               # mul_add_c(a[7],b[0],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_7,$b_1               # mul_add_c(a[7],b[1],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,7*$BNSZ($a0)       # r[7]=c2;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_6,$b_2               # mul_add_c(a[6],b[2],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_5,$b_3               # mul_add_c(a[5],b[3],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_4,$b_4               # mul_add_c(a[4],b[4],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_3,$b_5               # mul_add_c(a[3],b[5],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_2,$b_6               # mul_add_c(a[2],b[6],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_1,$b_7               # mul_add_c(a[1],b[7],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_2,$b_7               # mul_add_c(a[2],b[7],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,8*$BNSZ($a0)       # r[8]=c3;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_3,$b_6               # mul_add_c(a[3],b[6],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $c_3,$c_2,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_4,$b_5               # mul_add_c(a[4],b[5],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_5,$b_4               # mul_add_c(a[5],b[4],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_6,$b_3               # mul_add_c(a[6],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_7,$b_2               # mul_add_c(a[7],b[2],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_7,$b_3               # mul_add_c(a[7],b[3],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,9*$BNSZ($a0)       # r[9]=c1;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_6,$b_4               # mul_add_c(a[6],b[4],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_5,$b_5               # mul_add_c(a[5],b[5],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_4,$b_6               # mul_add_c(a[4],b[6],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_3,$b_7               # mul_add_c(a[3],b[7],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_4,$b_7               # mul_add_c(a[4],b[7],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,10*$BNSZ($a0)      # r[10]=c2;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_5,$b_6               # mul_add_c(a[5],b[6],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_6,$b_5               # mul_add_c(a[6],b[5],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_7,$b_4               # mul_add_c(a[7],b[4],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_7,$b_5               # mul_add_c(a[7],b[5],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,11*$BNSZ($a0)      # r[11]=c3;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_6,$b_6               # mul_add_c(a[6],b[6],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $c_3,$c_2,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_5,$b_7               # mul_add_c(a[5],b[7],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_6,$b_7               # mul_add_c(a[6],b[7],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,12*$BNSZ($a0)      # r[12]=c1;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_7,$b_6               # mul_add_c(a[7],b[6],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_7,$b_7               # mul_add_c(a[7],b[7],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,13*$BNSZ($a0)      # r[13]=c2;
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       $ST     $c_3,14*$BNSZ($a0)      # r[14]=c3;
+       $ST     $c_1,15*$BNSZ($a0)      # r[15]=c1;
+
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $s5,10*$SZREG($sp)
+       $REG_L  $s4,9*$SZREG($sp)
+       $REG_L  $s3,8*$SZREG($sp)
+       $REG_L  $s2,7*$SZREG($sp)
+       $REG_L  $s1,6*$SZREG($sp)
+       $REG_L  $s0,5*$SZREG($sp)
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       jr      $ra
+       $PTR_ADD $sp,12*$SZREG
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+       $REG_L  $s5,5*$SZREG($sp)
+       $REG_L  $s4,4*$SZREG($sp)
+       $REG_L  $s3,3*$SZREG($sp)
+       $REG_L  $s2,2*$SZREG($sp)
+       $REG_L  $s1,1*$SZREG($sp)
+       $REG_L  $s0,0*$SZREG($sp)
+       jr      $ra
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;  # closes bn_mul_comba8, then emits the directives and entry label of bn_mul_comba4
+.end   bn_mul_comba8
+
+.align 5
+.globl bn_mul_comba4
+.ent   bn_mul_comba4
+bn_mul_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);  # NUBI prologue: open a 6*$SZREG frame and save $ra,$t3,$t2,$t1,$t0,$gp
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;  # body: r[0..7] = a[0..3]*b[0..3] (r via $a0, a via $a1, b via $a2); all 16 products are summed column by column, c1/c2/c3 rotate as the three-word accumulator, and each extra-indented $MULTU launches the next column's first product before the current word is stored
+       .set    reorder
+       $LD     $a_0,0($a1)
+       $LD     $b_0,0($a2)
+       $LD     $a_1,$BNSZ($a1)
+       $LD     $a_2,2*$BNSZ($a1)
+       $MULTU  $a_0,$b_0               # mul_add_c(a[0],b[0],c1,c2,c3);
+       $LD     $a_3,3*$BNSZ($a1)
+       $LD     $b_1,$BNSZ($a2)
+       $LD     $b_2,2*$BNSZ($a2)
+       $LD     $b_3,3*$BNSZ($a2)
+       mflo    $c_1
+       mfhi    $c_2
+       $ST     $c_1,0($a0)
+
+       $MULTU  $a_0,$b_1               # mul_add_c(a[0],b[1],c2,c3,c1);
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_1,$b_0               # mul_add_c(a[1],b[0],c2,c3,c1);
+       $ADDU   $c_3,$t_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_2,$b_0               # mul_add_c(a[2],b[0],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       $ST     $c_2,$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_1,$b_1               # mul_add_c(a[1],b[1],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_0,$b_2               # mul_add_c(a[0],b[2],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_0,$b_3               # mul_add_c(a[0],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,2*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_1,$b_2               # mul_add_c(a[1],b[2],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $c_3,$c_2,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_2,$b_1               # mul_add_c(a[2],b[1],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $MULTU  $a_3,$b_0               # mul_add_c(a[3],b[0],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_3,$b_1               # mul_add_c(a[3],b[1],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,3*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_2,$b_2               # mul_add_c(a[2],b[2],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $c_1,$c_3,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $MULTU  $a_1,$b_3               # mul_add_c(a[1],b[3],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_2,$b_3               # mul_add_c(a[2],b[3],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,4*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $MULTU  $a_3,$b_2               # mul_add_c(a[3],b[2],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $c_2,$c_1,$t_2
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_3,$b_3               # mul_add_c(a[3],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,5*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       $ST     $c_1,6*$BNSZ($a0)
+       $ST     $c_2,7*$BNSZ($a0)
+
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);  # NUBI epilogue: reload $t3..$t0 and $gp, then pop the frame; the saved $ra slot is not reloaded
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;  # return to caller; the explicit nop follows jr because .set noreorder is in effect here
+       jr      $ra
+       nop
+.end   bn_mul_comba4
+___
+
+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);  # from here on a[4..7] use the registers that held b[0..3]; the visible part of bn_sqr_comba8 below reads only a[]
+
+$code.=<<___;
+
+.align 5
+.globl bn_sqr_comba8
+.ent   bn_sqr_comba8
+bn_sqr_comba8:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+$code.=<<___;
+       .set    reorder
+       $LD     $a_0,0($a1)
+       $LD     $a_1,$BNSZ($a1)
+       $LD     $a_2,2*$BNSZ($a1)
+       $LD     $a_3,3*$BNSZ($a1)
+
+       $MULTU  $a_0,$a_0               # mul_add_c(a[0],b[0],c1,c2,c3);
+       $LD     $a_4,4*$BNSZ($a1)
+       $LD     $a_5,5*$BNSZ($a1)
+       $LD     $a_6,6*$BNSZ($a1)
+       $LD     $a_7,7*$BNSZ($a1)
+       mflo    $c_1
+       mfhi    $c_2
+       $ST     $c_1,0($a0)
+
+       $MULTU  $a_0,$a_1               # mul_add_c2(a[0],b[1],c2,c3,c1);
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+        $MULTU $a_2,$a_0               # mul_add_c2(a[2],b[0],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $c_3,$t_2,$at
+       $ST     $c_2,$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_1               # mul_add_c(a[1],b[1],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_0,$a_3               # mul_add_c2(a[0],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,2*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_3,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_2               # mul_add_c2(a[1],b[2],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+        $MULTU $a_4,$a_0               # mul_add_c2(a[4],b[0],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,3*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_3,$a_1               # mul_add_c2(a[3],b[1],c2,c3,c1);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_1,$at
+       $MULTU  $a_2,$a_2               # mul_add_c(a[2],b[2],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_0,$a_5               # mul_add_c2(a[0],b[5],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,4*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_4               # mul_add_c2(a[1],b[4],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_2,$at
+       $MULTU  $a_2,$a_3               # mul_add_c2(a[2],b[3],c3,c1,c2);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+        $MULTU $a_6,$a_0               # mul_add_c2(a[6],b[0],c1,c2,c3);
+       $ADDU   $c_2,$at
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,5*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_3,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_5,$a_1               # mul_add_c2(a[5],b[1],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+       $MULTU  $a_4,$a_2               # mul_add_c2(a[4],b[2],c1,c2,c3);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+       $MULTU  $a_3,$a_3               # mul_add_c(a[3],b[3],c1,c2,c3);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_0,$a_7               # mul_add_c2(a[0],b[7],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,6*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_6               # mul_add_c2(a[1],b[6],c2,c3,c1);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_1,$at
+       $MULTU  $a_2,$a_5               # mul_add_c2(a[2],b[5],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_1,$at
+       $MULTU  $a_3,$a_4               # mul_add_c2(a[3],b[4],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_1,$at
+        $MULTU $a_7,$a_1               # mul_add_c2(a[7],b[1],c3,c1,c2);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,7*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_6,$a_2               # mul_add_c2(a[6],b[2],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_2,$at
+       $MULTU  $a_5,$a_3               # mul_add_c2(a[5],b[3],c3,c1,c2);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_2,$at
+       $MULTU  $a_4,$a_4               # mul_add_c(a[4],b[4],c3,c1,c2);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_2,$a_7               # mul_add_c2(a[2],b[7],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,8*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_3,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_3,$a_6               # mul_add_c2(a[3],b[6],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+       $MULTU  $a_4,$a_5               # mul_add_c2(a[4],b[5],c1,c2,c3);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+        $MULTU $a_7,$a_3               # mul_add_c2(a[7],b[3],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,9*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_6,$a_4               # mul_add_c2(a[6],b[4],c2,c3,c1);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_1,$at
+       $MULTU  $a_5,$a_5               # mul_add_c(a[5],b[5],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_4,$a_7               # mul_add_c2(a[4],b[7],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,10*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_5,$a_6               # mul_add_c2(a[5],b[6],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_2,$at
+        $MULTU $a_7,$a_5               # mul_add_c2(a[7],b[5],c1,c2,c3);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,11*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_3,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_6,$a_6               # mul_add_c(a[6],b[6],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+        $MULTU $a_6,$a_7               # mul_add_c2(a[6],b[7],c2,c3,c1);
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,12*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+        $MULTU $a_7,$a_7               # mul_add_c(a[7],b[7],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,13*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       $ST     $c_3,14*$BNSZ($a0)
+       $ST     $c_1,15*$BNSZ($a0)
+
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       nop
+.end   bn_sqr_comba8
+
+.align 5
+.globl bn_sqr_comba4
+.ent   bn_sqr_comba4
+bn_sqr_comba4:
+___
+# bn_sqr_comba4 prologue, emitted for NUBI flavours only: declare and open
+# a six-slot stack frame, then save $ra, $t3..$t0 and $gp into it.
+$code.=<<___ if ($flavour =~ /nubi/i);
+       .frame  $sp,6*$SZREG,$ra
+       .mask   0x8000f008,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,6*$SZREG
+       $REG_S  $ra,5*$SZREG($sp)
+       $REG_S  $t3,4*$SZREG($sp)
+       $REG_S  $t2,3*$SZREG($sp)
+       $REG_S  $t1,2*$SZREG($sp)
+       $REG_S  $t0,1*$SZREG($sp)
+       $REG_S  $gp,0*$SZREG($sp)
+___
+# bn_sqr_comba4 body: comba (column-wise) squaring of the four limbs read
+# from ($a1) into the eight limbs stored at ($a0).  $c_1/$c_2/$c_3 rotate
+# as a three-limb column accumulator.  Off-diagonal products are doubled
+# (mul_add_c2) by shifting the hi:lo pair left by one bit; the bit shifted
+# out of hi is added straight to the top accumulator limb.  Each $MULTU is
+# issued ahead of the mflo/mfhi pair that consumes it, so the comment on
+# a $MULTU names the step that follows it.
+#
+# Carry handling: the doubled high limb, (hi<<1)|msb(lo), can be all ones.
+# Folding the carry of the low-limb addition into it before the sltu test
+# would wrap it to zero and silently drop that carry, yielding a wrong
+# square.  The low-limb carry is therefore added to the accumulator on its
+# own, with its own carry-out.  The plain mul_add_c steps keep the shorter
+# sequence: the high limb of a single product is at most 2^w-2, so adding
+# a carry of 1 to it cannot wrap.
+$code.=<<___;
+       .set    reorder
+       $LD     $a_0,0($a1)
+       $LD     $a_1,$BNSZ($a1)
+       $MULTU  $a_0,$a_0               # mul_add_c(a[0],b[0],c1,c2,c3);
+       $LD     $a_2,2*$BNSZ($a1)
+       $LD     $a_3,3*$BNSZ($a1)
+       mflo    $c_1
+       mfhi    $c_2
+       $ST     $c_1,0($a0)
+
+       $MULTU  $a_0,$a_1               # mul_add_c2(a[0],b[1],c2,c3,c1);
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+        $MULTU $a_2,$a_0               # mul_add_c2(a[2],b[0],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $c_3,$t_2,$at
+       sltu    $at,$c_3,$at            # carry out of hi+carry
+       $ADDU   $c_1,$at
+       $ST     $c_2,$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_1               # mul_add_c(a[1],b[1],c3,c1,c2);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $c_1,$at                # low-limb carry, added separately
+       sltu    $at,$c_1,$at
+       $ADDU   $c_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+        $MULTU $a_0,$a_3               # mul_add_c2(a[0],b[3],c1,c2,c3);
+       $ADDU   $t_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,2*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_3,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_1,$a_2               # mul_add_c2(a[1],b[2],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $c_2,$at                # low-limb carry, added separately
+       sltu    $at,$c_2,$at
+       $ADDU   $c_3,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $at,$t_2,$zero
+       $ADDU   $c_3,$at
+        $MULTU $a_3,$a_1               # mul_add_c2(a[3],b[1],c2,c3,c1);
+       $SLL    $t_2,1
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $c_2,$at                # low-limb carry, added separately
+       sltu    $at,$c_2,$at
+       $ADDU   $c_3,$at
+       $ADDU   $c_2,$t_2
+       sltu    $at,$c_2,$t_2
+       $ADDU   $c_3,$at
+       $ST     $c_1,3*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_1,$t_2,$zero
+       $SLL    $t_2,1
+       $MULTU  $a_2,$a_2               # mul_add_c(a[2],b[2],c2,c3,c1);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+       $ADDU   $c_3,$at                # low-limb carry, added separately
+       sltu    $at,$c_3,$at
+       $ADDU   $c_1,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_2,$t_1
+       sltu    $at,$c_2,$t_1
+        $MULTU $a_2,$a_3               # mul_add_c2(a[2],b[3],c3,c1,c2);
+       $ADDU   $t_2,$at
+       $ADDU   $c_3,$t_2
+       sltu    $at,$c_3,$t_2
+       $ADDU   $c_1,$at
+       $ST     $c_2,4*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       slt     $c_2,$t_2,$zero
+       $SLL    $t_2,1
+        $MULTU $a_3,$a_3               # mul_add_c(a[3],b[3],c1,c2,c3);
+       slt     $a2,$t_1,$zero
+       $ADDU   $t_2,$a2
+       $SLL    $t_1,1
+       $ADDU   $c_3,$t_1
+       sltu    $at,$c_3,$t_1
+       $ADDU   $c_1,$at                # low-limb carry, added separately
+       sltu    $at,$c_1,$at
+       $ADDU   $c_2,$at
+       $ADDU   $c_1,$t_2
+       sltu    $at,$c_1,$t_2
+       $ADDU   $c_2,$at
+       $ST     $c_3,5*$BNSZ($a0)
+
+       mflo    $t_1
+       mfhi    $t_2
+       $ADDU   $c_1,$t_1
+       sltu    $at,$c_1,$t_1
+       $ADDU   $t_2,$at
+       $ADDU   $c_2,$t_2
+       $ST     $c_1,6*$BNSZ($a0)
+       $ST     $c_2,7*$BNSZ($a0)
+
+       .set    noreorder
+___
+# bn_sqr_comba4 epilogue, emitted for NUBI flavours only: reload $t3..$t0
+# and $gp from the frame and pop its six slots.  $ra is not reloaded here.
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $t3,4*$SZREG($sp)
+       $REG_L  $t2,3*$SZREG($sp)
+       $REG_L  $t1,2*$SZREG($sp)
+       $REG_L  $t0,1*$SZREG($sp)
+       $REG_L  $gp,0*$SZREG($sp)
+       $PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+       jr      $ra
+       nop
+.end   bn_sqr_comba4
+___
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-mips.pl b/crypto/sha/asm/sha1-mips.pl
new file mode 100644 (file)
index 0000000..f1a702f
--- /dev/null
@@ -0,0 +1,354 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for MIPS.
+
+# Performance improvement is 30% on unaligned input. The "secret" is
+# to deploy lwl/lwr pair to load unaligned input. One could have
+# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
+# compatible subroutine. There is room for minor optimization on
+# little-endian platforms...
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+       $PTR_ADD="dadd";        # incidentally works even on n32
+       $PTR_SUB="dsub";        # incidentally works even on n32
+       $REG_S="sd";
+       $REG_L="ld";
+       $PTR_SLL="dsll";        # incidentally works even on n32
+       $SZREG=8;
+} else {
+       $PTR_ADD="add";
+       $PTR_SUB="sub";
+       $REG_S="sw";
+       $REG_L="lw";
+       $PTR_SLL="sll";
+       $SZREG=4;
+}
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# Decide the byte order of the *target*.  When CC is set, feed the bare
+# token MIPSEL through its preprocessor: the result is 1 if the token comes
+# back untouched and 0 if the preprocessor replaced it (presumably because
+# a little-endian toolchain predefines MIPSEL -- confirm for your compiler).
+# The probe is skipped when CC is unset, so that $big_endian stays
+# undefined and the host-order fallback below can take over; without the
+# guard the ?1:0 result was always defined and the fallback never ran.
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+
+# The output file is the first-to-last argument that looks like name.ext;
+# generated assembly is written there through STDOUT.
+for (@ARGV) {  $output=$_ if (/^\w[\w\-]*\.\w+$/);   }
+open STDOUT,">$output";
+
+# No compiler to ask: assume the target has the byte order of the host
+# (pack 'N' is big-endian, unpack 'L' is native order).
+if (!defined($big_endian))
+            {   $big_endian=(unpack('L',pack('N',1))==1);   }
+
+# offsets of the Most and Least Significant Bytes
+$MSB=$big_endian?0:3;
+$LSB=3&~$MSB;
+
+@X=map("\$$_",(8..23));        # a4-a7,s0-s11
+
+$ctx=$a0;
+$inp=$a1;
+$num=$a2;
+$A="\$1";
+$B="\$2";
+$C="\$3";
+$D="\$7";
+$E="\$24";     @V=($A,$B,$C,$D,$E);
+$t0="\$25";
+$t1=$num;      # $num is offloaded to stack
+$t2="\$30";    # fp
+$K="\$31";     # ra
+
+# Emit one of SHA-1 rounds 0..14.  Arguments are the round number followed
+# by the five working-variable registers in their current rotation.  On
+# little-endian targets the current input word is byte-swapped first; the
+# next input word is fetched with an lwl/lwr pair interleaved with the
+# round arithmetic, which adds K, rol(a,5), ((c^d)&b)^d and the current
+# word to the last working variable and rotates the second one by 30.
+sub BODY_00_14 {
+my $rnd=shift;
+my ($va,$vb,$vc,$vd,$ve)=@_;
+my $nxt=$rnd+1;
+my ($xi,$xn)=@X[$rnd,$nxt];
+
+unless ($big_endian) {
+$code.=<<___;
+       srl     $t0,$xi,24   # byte swap($rnd)
+       srl     $t1,$xi,8
+       andi    $t2,$xi,0xFF00
+       sll     $xi,$xi,24
+       andi    $t1,0xFF00
+       sll     $t2,$t2,8
+       or      $xi,$t0
+       or      $t1,$t2
+       or      $xi,$t1
+___
+}
+$code.=<<___;
+        lwl    $xn,$nxt*4+$MSB($inp)
+       sll     $t0,$va,5        # $rnd
+       addu    $ve,$K
+        lwr    $xn,$nxt*4+$LSB($inp)
+       srl     $t1,$va,27
+       addu    $ve,$t0
+       xor     $t0,$vc,$vd
+       addu    $ve,$t1
+       sll     $t2,$vb,30
+       and     $t0,$vb
+       srl     $vb,$vb,2
+       xor     $t0,$vd
+       addu    $ve,$xi
+       or      $vb,$t2
+       addu    $ve,$t0
+___
+}
+
+# Emit one of SHA-1 rounds 15..19.  Same round function as BODY_00_14, but
+# instead of loading input it computes the next schedule word in place in
+# the 16-entry ring @X: X[j] ^= X[j+2] ^ X[j+8] ^ X[j+13], rotated left by
+# one (indices modulo 16, j = round+1).  Round 15 still consumes a raw
+# input word, so on little-endian targets that word is byte-swapped first.
+sub BODY_15_19 {
+my $rnd=shift;
+my ($va,$vb,$vc,$vd,$ve)=@_;
+my $nxt=$rnd+1;
+my ($xi,$xn,$x2,$x8,$x13)=
+       map { $X[$_%16] } ($rnd,$nxt,$nxt+2,$nxt+8,$nxt+13);
+
+if ($rnd==15 && !$big_endian) {
+$code.=<<___;
+       srl     $t0,$xi,24   # byte swap($rnd)
+       srl     $t1,$xi,8
+       andi    $t2,$xi,0xFF00
+       sll     $xi,$xi,24
+       andi    $t1,0xFF00
+       sll     $t2,$t2,8
+       or      $xi,$t0
+       or      $xi,$t1
+       or      $xi,$t2
+___
+}
+$code.=<<___;
+        xor    $xn,$x2
+       sll     $t0,$va,5        # $rnd
+       addu    $ve,$K
+       srl     $t1,$va,27
+       addu    $ve,$t0
+        xor    $xn,$x8
+       xor     $t0,$vc,$vd
+       addu    $ve,$t1
+        xor    $xn,$x13
+       sll     $t2,$vb,30
+       and     $t0,$vb
+        srl    $t1,$xn,31
+        addu   $xn,$xn
+       srl     $vb,$vb,2
+       xor     $t0,$vd
+        or     $xn,$t1
+       addu    $ve,$xi
+       or      $vb,$t2
+       addu    $ve,$t0
+___
+}
+
+# Emit one SHA-1 round whose mixing function is b^c^d (used by the caller
+# for rounds 20..39 and again for 60..79).  Rounds below 79 also compute
+# the next schedule word in the ring @X; round 79 has no successor, so it
+# uses those slots to preload the five context words from ($ctx) into
+# X[0..4] instead.
+sub BODY_20_39 {
+my $rnd=shift;
+my ($va,$vb,$vc,$vd,$ve)=@_;
+my $nxt=$rnd+1;
+my ($xi,$xn,$x2,$x8,$x13)=
+       map { $X[$_%16] } ($rnd,$nxt,$nxt+2,$nxt+8,$nxt+13);
+
+if ($rnd<79) {
+$code.=<<___;
+        xor    $xn,$x2
+       sll     $t0,$va,5        # $rnd
+       addu    $ve,$K
+       srl     $t1,$va,27
+       addu    $ve,$t0
+        xor    $xn,$x8
+       xor     $t0,$vc,$vd
+       addu    $ve,$t1
+        xor    $xn,$x13
+       sll     $t2,$vb,30
+       xor     $t0,$vb
+        srl    $t1,$xn,31
+        addu   $xn,$xn
+       srl     $vb,$vb,2
+       addu    $ve,$xi
+        or     $xn,$t1
+       or      $vb,$t2
+       addu    $ve,$t0
+___
+}
+elsif ($rnd==79) {
+$code.=<<___;
+        lw     @X[0],0($ctx)
+       sll     $t0,$va,5        # $rnd
+       addu    $ve,$K
+        lw     @X[1],4($ctx)
+       srl     $t1,$va,27
+       addu    $ve,$t0
+        lw     @X[2],8($ctx)
+       xor     $t0,$vc,$vd
+       addu    $ve,$t1
+        lw     @X[3],12($ctx)
+       sll     $t2,$vb,30
+       xor     $t0,$vb
+        lw     @X[4],16($ctx)
+       srl     $vb,$vb,2
+       addu    $ve,$xi
+       or      $vb,$t2
+       addu    $ve,$t0
+___
+}
+}
+
+# Emit one of SHA-1 rounds 40..59.  The mixing term is accumulated in two
+# parts, (c&d) and then ((c^d)&b), both added to the last working
+# variable; the next schedule word is computed in the ring @X exactly as
+# in the other post-load rounds.  Nothing is emitted for a round number
+# of 79 or above.
+sub BODY_40_59 {
+my $rnd=shift;
+my ($va,$vb,$vc,$vd,$ve)=@_;
+my $nxt=$rnd+1;
+my ($xi,$xn,$x2,$x8,$x13)=
+       map { $X[$_%16] } ($rnd,$nxt,$nxt+2,$nxt+8,$nxt+13);
+
+if ($rnd<79) {
+$code.=<<___;
+        xor    $xn,$x2
+       sll     $t0,$va,5        # $rnd
+       addu    $ve,$K
+       srl     $t1,$va,27
+       addu    $ve,$t0
+        xor    $xn,$x8
+       and     $t0,$vc,$vd
+       addu    $ve,$t1
+        xor    $xn,$x13
+       sll     $t2,$vb,30
+       addu    $ve,$t0
+        srl    $t1,$xn,31
+       xor     $t0,$vc,$vd
+        addu   $xn,$xn
+       and     $t0,$vb
+       srl     $vb,$vb,2
+        or     $xn,$t1
+       addu    $ve,$xi
+       or      $vb,$t2
+       addu    $ve,$t0
+___
+}
+}
+
+# Frame size in register-sized slots (scaled by $SZREG where it is used).
+$FRAMESIZE=16; # large enough to accommodate NUBI saved registers
+# Bit N of the .mask value stands for register N: 0xc0ff0000 covers
+# registers 31, 30 and 23..16 ($ra, $fp, $s11..$s4); the NUBI value adds
+# registers 15..12 ($s3..$s0) and 3 ($gp), matching the extra NUBI saves
+# in the prologue.
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set   noat
+.set   noreorder
+.align 5
+.globl sha1_block_data_order
+.ent   sha1_block_data_order
+sha1_block_data_order:
+       .frame  $sp,$FRAMESIZE*$SZREG,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+       $PTR_SUB $sp,$FRAMESIZE*$SZREG
+       $REG_S  $ra,($FRAMESIZE-1)*$SZREG($sp)
+       $REG_S  $fp,($FRAMESIZE-2)*$SZREG($sp)
+       $REG_S  $s11,($FRAMESIZE-3)*$SZREG($sp)
+       $REG_S  $s10,($FRAMESIZE-4)*$SZREG($sp)
+       $REG_S  $s9,($FRAMESIZE-5)*$SZREG($sp)
+       $REG_S  $s8,($FRAMESIZE-6)*$SZREG($sp)
+       $REG_S  $s7,($FRAMESIZE-7)*$SZREG($sp)
+       $REG_S  $s6,($FRAMESIZE-8)*$SZREG($sp)
+       $REG_S  $s5,($FRAMESIZE-9)*$SZREG($sp)
+       $REG_S  $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+       $REG_S  $s3,($FRAMESIZE-11)*$SZREG($sp)
+       $REG_S  $s2,($FRAMESIZE-12)*$SZREG($sp)
+       $REG_S  $s1,($FRAMESIZE-13)*$SZREG($sp)
+       $REG_S  $s0,($FRAMESIZE-14)*$SZREG($sp)
+       $REG_S  $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+       $PTR_SLL $num,6
+       $PTR_ADD $num,$inp
+       $REG_S  $num,0($sp)
+       lw      $A,0($ctx)
+       lw      $B,4($ctx)
+       lw      $C,8($ctx)
+       lw      $D,12($ctx)
+       b       .Loop
+       lw      $E,16($ctx)
+.align 4
+.Loop:
+       .set    reorder
+       lwl     @X[0],$MSB($inp)
+       lui     $K,0x5a82
+       lwr     @X[0],$LSB($inp)
+       ori     $K,0x7999       # K_00_19
+___
+# Unroll all 80 rounds.  After each round the register list @V is rotated
+# right by one (the last element moves to the front), so the next round
+# sees the working variables in their new roles without any register
+# moves being emitted.  $i carries over from one loop to the next.
+for ($i=0;$i<15;$i++)  { &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++)      { &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
+# Switch the round constant register $K before each new group of 20.
+$code.=<<___;
+       lui     $K,0x6ed9
+       ori     $K,0xeba1       # K_20_39
+___
+for (;$i<40;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       lui     $K,0x8f1b
+       ori     $K,0xbcdc       # K_40_59
+___
+for (;$i<60;$i++)      { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       lui     $K,0xca62
+       ori     $K,0xc1d6       # K_60_79
+___
+# Rounds 60..79 reuse the 20..39 generator; its round-79 variant preloads
+# the context words instead of computing another schedule word.
+for (;$i<80;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+       $PTR_ADD $inp,64
+       $REG_L  $num,0($sp)
+
+       addu    $A,$X[0]
+       addu    $B,$X[1]
+       sw      $A,0($ctx)
+       addu    $C,$X[2]
+       addu    $D,$X[3]
+       sw      $B,4($ctx)
+       addu    $E,$X[4]
+       sw      $C,8($ctx)
+       sw      $D,12($ctx)
+       sw      $E,16($ctx)
+       .set    noreorder
+       bne     $inp,$num,.Loop
+       nop
+
+       .set    noreorder
+       $REG_L  $ra,($FRAMESIZE-1)*$SZREG($sp)
+       $REG_L  $fp,($FRAMESIZE-2)*$SZREG($sp)
+       $REG_L  $s11,($FRAMESIZE-3)*$SZREG($sp)
+       $REG_L  $s10,($FRAMESIZE-4)*$SZREG($sp)
+       $REG_L  $s9,($FRAMESIZE-5)*$SZREG($sp)
+       $REG_L  $s8,($FRAMESIZE-6)*$SZREG($sp)
+       $REG_L  $s7,($FRAMESIZE-7)*$SZREG($sp)
+       $REG_L  $s6,($FRAMESIZE-8)*$SZREG($sp)
+       $REG_L  $s5,($FRAMESIZE-9)*$SZREG($sp)
+       $REG_L  $s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $s3,($FRAMESIZE-11)*$SZREG($sp)
+       $REG_L  $s2,($FRAMESIZE-12)*$SZREG($sp)
+       $REG_L  $s1,($FRAMESIZE-13)*$SZREG($sp)
+       $REG_L  $s0,($FRAMESIZE-14)*$SZREG($sp)
+       $REG_L  $gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end   sha1_block_data_order
+.rdata
+.asciiz        "SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl
new file mode 100644 (file)
index 0000000..ba5b250
--- /dev/null
@@ -0,0 +1,455 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA2 block procedures for MIPS.
+
+# October 2010.
+#
+# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
+# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
+# for now can only be compiled for MIPS64 ISA] improvement is modest
+# ~17%, but it comes for free, because it's same instruction sequence.
+# Improvement coefficients are for aligned input.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+#   excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+       $PTR_ADD="dadd";        # incidentally works even on n32
+       $PTR_SUB="dsub";        # incidentally works even on n32
+       $REG_S="sd";
+       $REG_L="ld";
+       $PTR_SLL="dsll";        # incidentally works even on n32
+       $SZREG=8;
+} else {
+       $PTR_ADD="add";
+       $PTR_SUB="sub";
+       $REG_S="sw";
+       $REG_L="lw";
+       $PTR_SLL="sll";
+       $SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+# Decide the byte order of the *target*.  When CC is set, feed the bare
+# token MIPSEL through its preprocessor: the result is 1 if the token comes
+# back untouched and 0 if the preprocessor replaced it (presumably because
+# a little-endian toolchain predefines MIPSEL -- confirm for your compiler).
+# The probe is skipped when CC is unset, so that $big_endian stays
+# undefined and the host-order fallback below can take over; without the
+# guard the ?1:0 result was always defined and the fallback never ran.
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+
+# The output file is the last argument that looks like name.ext; generated
+# assembly is written there through STDOUT.
+for (@ARGV) {  $output=$_ if (/^\w[\w\-]*\.\w+$/);     }
+open STDOUT,">$output";
+
+# No compiler to ask: assume the target has the byte order of the host
+# (pack 'N' is big-endian, unpack 'L' is native order).
+if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
+
+if ($output =~ /512/) {
+       $label="512";
+       $SZ=8;
+       $LD="ld";               # load from memory
+       $ST="sd";               # store to memory
+       $SLL="dsll";            # shift left logical
+       $SRL="dsrl";            # shift right logical
+       $ADDU="daddu";
+       @Sigma0=(28,34,39);
+       @Sigma1=(14,18,41);
+       @sigma0=( 7, 1, 8);     # right shift first
+       @sigma1=( 6,19,61);     # right shift first
+       $lastK=0x817;
+       $rounds=80;
+} else {
+       $label="256";
+       $SZ=4;
+       $LD="lw";               # load from memory
+       $ST="sw";               # store to memory
+       $SLL="sll";             # shift left logical
+       $SRL="srl";             # shift right logical
+       $ADDU="addu";
+       @Sigma0=( 2,13,22);
+       @Sigma1=( 6,11,25);
+       @sigma0=( 3, 7,18);     # right shift first
+       @sigma1=(10,17,19);     # right shift first
+       $lastK=0x8f2;
+       $rounds=64;
+}
+
+$MSB = $big_endian ? 0 : ($SZ-1);
+$LSB = ($SZ-1)&~$MSB;
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
+@X=map("\$$_",(8..23));
+
+$ctx=$a0;
+$inp=$a1;
+$len=$a2;      $Ktbl=$len;
+
+# Emit one SHA-256/512 round.  Arguments are the round number followed by
+# the eight working-variable registers in their current rotation.  The
+# message word for the round is X[0] (the caller rotates @X each round);
+# X[4..7] serve as scratch.  For rounds below 15 the next input word is
+# loaded into X[1] with a load-left/load-right pair, and for rounds below
+# 16 on little-endian targets X[0] is byte-swapped (32- or 64-bit variant
+# chosen by $SZ).  The round adds h, Sigma1(e), Ch(e,f,g), K[i] and X[0]
+# into T1, stores X[0] to the on-stack ring buffer, then updates d and h.
+# From round 13 on, X[3] is prefetched from the ring buffer.
+sub BODY_00_15 {
+my ($rnd,$va,$vb,$vc,$vd,$ve,$vf,$vg,$vh)=@_;
+my ($T1,$tmp0,$tmp1,$tmp2)=@X[4..7];
+
+if ($rnd<15) {
+$code.=<<___;
+       ${LD}l  @X[1],`($rnd+1)*$SZ+$MSB`($inp)
+       ${LD}r  @X[1],`($rnd+1)*$SZ+$LSB`($inp)
+___
+}
+if (!$big_endian && $rnd<16) {
+    if ($SZ==4) {
+$code.=<<___;
+       srl     $tmp0,@X[0],24          # byte swap($rnd)
+       srl     $tmp1,@X[0],8
+       andi    $tmp2,@X[0],0xFF00
+       sll     @X[0],@X[0],24
+       andi    $tmp1,0xFF00
+       sll     $tmp2,$tmp2,8
+       or      @X[0],$tmp0
+       or      $tmp1,$tmp2
+       or      @X[0],$tmp1
+___
+    }
+    elsif ($SZ==8) {
+$code.=<<___;
+       ori     $tmp0,$zero,0xFF
+       dsll    $tmp2,$tmp0,32
+       or      $tmp0,$tmp2             # 0x000000FF000000FF
+       and     $tmp1,@X[0],$tmp0       # byte swap($rnd)
+       dsrl    $tmp2,@X[0],24
+       dsll    $tmp1,24
+       and     $tmp2,$tmp0
+       dsll    $tmp0,8                 # 0x0000FF000000FF00
+       or      $tmp1,$tmp2
+       and     $tmp2,@X[0],$tmp0
+       dsrl    @X[0],8
+       dsll    $tmp2,8
+       and     @X[0],$tmp0
+       or      $tmp1,$tmp2
+       or      @X[0],$tmp1
+       dsrl    $tmp1,@X[0],32
+       dsll    @X[0],32
+       or      @X[0],$tmp1
+___
+    }
+}
+$code.=<<___;
+       $ADDU   $T1,$X[0],$vh                    # $rnd
+       $SRL    $vh,$ve,@Sigma1[0]
+       xor     $tmp2,$vf,$vg
+       $SLL    $tmp1,$ve,`$SZ*8-@Sigma1[2]`
+       and     $tmp2,$ve
+       $SRL    $tmp0,$ve,@Sigma1[1]
+       xor     $vh,$tmp1
+       $SLL    $tmp1,$ve,`$SZ*8-@Sigma1[1]`
+       xor     $vh,$tmp0
+       $SRL    $tmp0,$ve,@Sigma1[2]
+       xor     $vh,$tmp1
+       $SLL    $tmp1,$ve,`$SZ*8-@Sigma1[0]`
+       xor     $vh,$tmp0
+       xor     $tmp2,$vg                        # Ch(e,f,g)
+       xor     $tmp0,$tmp1,$vh                  # Sigma1(e)
+
+       $SRL    $vh,$va,@Sigma0[0]
+       $ADDU   $T1,$tmp2
+       $LD     $tmp2,`$rnd*$SZ`($Ktbl)           # K[$rnd]
+       $SLL    $tmp1,$va,`$SZ*8-@Sigma0[2]`
+       $ADDU   $T1,$tmp0
+       $SRL    $tmp0,$va,@Sigma0[1]
+       xor     $vh,$tmp1
+       $SLL    $tmp1,$va,`$SZ*8-@Sigma0[1]`
+       xor     $vh,$tmp0
+       $SRL    $tmp0,$va,@Sigma0[2]
+       xor     $vh,$tmp1
+       $SLL    $tmp1,$va,`$SZ*8-@Sigma0[0]`
+       xor     $vh,$tmp0
+       $ST     @X[0],`($rnd%16)*$SZ`($sp)        # offload to ring buffer
+       xor     $vh,$tmp1                        # Sigma0(a)
+
+       or      $tmp0,$va,$vb
+       and     $tmp1,$va,$vb
+       and     $tmp0,$vc
+       or      $tmp1,$tmp0                     # Maj(a,b,c)
+       $ADDU   $T1,$tmp2                       # +=K[$rnd]
+       $ADDU   $vh,$tmp1
+
+       $ADDU   $vd,$T1
+       $ADDU   $vh,$T1
+___
+if ($rnd>=13) {
+$code.=<<___;
+       $LD     @X[3],`(($rnd+3)%16)*$SZ`($sp)    # prefetch from ring buffer
+___
+}
+}
+
+# Emit one round from 16 onwards: first the message-schedule update, which
+# adds X[9], sigma0(X[1]) and sigma1(X[14]) into X[0] using X[4..7] as
+# scratch, then the common round body via BODY_00_15 with the same
+# argument list.
+sub BODY_16_XX {
+my ($rnd)=@_;
+my ($tmp0,$tmp1,$tmp2,$tmp3)=@X[4..7];
+
+$code.=<<___;
+       $SRL    $tmp2,@X[1],@sigma0[0]          # Xupdate($rnd)
+       $ADDU   @X[0],@X[9]                     # +=X[i+9]
+       $SLL    $tmp1,@X[1],`$SZ*8-@sigma0[2]`
+       $SRL    $tmp0,@X[1],@sigma0[1]
+       xor     $tmp2,$tmp1
+       $SLL    $tmp1,`@sigma0[2]-@sigma0[1]`
+       xor     $tmp2,$tmp0
+       $SRL    $tmp0,@X[1],@sigma0[2]
+       xor     $tmp2,$tmp1
+
+       $SRL    $tmp3,@X[14],@sigma1[0]
+       xor     $tmp2,$tmp0                     # sigma0(X[i+1])
+       $SLL    $tmp1,@X[14],`$SZ*8-@sigma1[2]`
+       $ADDU   @X[0],$tmp2
+       $SRL    $tmp0,@X[14],@sigma1[1]
+       xor     $tmp3,$tmp1
+       $SLL    $tmp1,`@sigma1[2]-@sigma1[1]`
+       xor     $tmp3,$tmp0
+       $SRL    $tmp0,@X[14],@sigma1[2]
+       xor     $tmp3,$tmp1
+
+       xor     $tmp3,$tmp0                     # sigma1(X[i+14])
+       $ADDU   @X[0],$tmp3
+___
+       BODY_00_15(@_);
+}
+
+$FRAMESIZE=16*$SZ+16*$SZREG;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code.=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set   noat
+#if !defined(__vxworks) || defined(__pic__)
+.option        pic2
+#endif
+
+.align 5
+.globl sha${label}_block_data_order
+.ent   sha${label}_block_data_order
+sha${label}_block_data_order:
+       .frame  $sp,$FRAMESIZE,$ra
+       .mask   $SAVED_REGS_MASK,-$SZREG
+       .set    noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);  # o32 PIC-ification
+       .cpload $pf
+___
+$code.=<<___;
+       $PTR_SUB $sp,$FRAMESIZE
+       $REG_S  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_S  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_S  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_S  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_S  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_S  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_S  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_S  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_S  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_S  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
+       $REG_S  $s3,$FRAMESIZE-11*$SZREG($sp)
+       $REG_S  $s2,$FRAMESIZE-12*$SZREG($sp)
+       $REG_S  $s1,$FRAMESIZE-13*$SZREG($sp)
+       $REG_S  $s0,$FRAMESIZE-14*$SZREG($sp)
+       $REG_S  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+       $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
+___
+$code.=<<___ if ($flavour !~ /o32/i);  # non-o32 PIC-ification
+       .cplocal        $Ktbl
+       .cpsetup        $pf,$zero,sha${label}_block_data_order
+___
+$code.=<<___;
+       .set    reorder
+       la      $Ktbl,K${label}         # PIC-ified 'load address'
+
+       $LD     $A,0*$SZ($ctx)          # load context
+       $LD     $B,1*$SZ($ctx)
+       $LD     $C,2*$SZ($ctx)
+       $LD     $D,3*$SZ($ctx)
+       $LD     $E,4*$SZ($ctx)
+       $LD     $F,5*$SZ($ctx)
+       $LD     $G,6*$SZ($ctx)
+       $LD     $H,7*$SZ($ctx)
+
+       $PTR_ADD @X[15],$inp            # pointer to the end of input
+       $REG_S  @X[15],16*$SZ($sp)
+       b       .Loop
+
+.align 5
+.Loop:
+       ${LD}l  @X[0],$MSB($inp)
+       ${LD}r  @X[0],$LSB($inp)
+___
+for ($i=0;$i<16;$i++)
+{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+$code.=<<___;
+       b       .L16_xx
+.align 4
+.L16_xx:
+___
+for (;$i<32;$i++)
+{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+# Tail of the per-block loop and first part of the function epilogue:
+#  - mask @X[6] down to its low 12 bits and compare it with $lastK; while
+#    they differ, branch back to .L16_xx. ".set noreorder" is selected right
+#    before the branch, so the $Ktbl advance by 16*$SZ that follows it is
+#    written as the branch's delay-slot instruction;
+#  - reload the end-of-input pointer from 16*$SZ($sp), load the eight words
+#    at 0..7*$SZ($ctx), add them into $A..$H and store the sums back to the
+#    same slots; $inp is advanced by 16*$SZ in between;
+#  - branch back to .Loop while $inp differs from the end pointer, with the
+#    $Ktbl rewind by ($rounds-16)*$SZ written right after the branch;
+#  - reload $ra, $fp and $s11..$s4 from the top ten $SZREG-sized slots of
+#    the $FRAMESIZE frame.
+# NOTE(review): what @X[6] contains here is decided inside BODY_16_XX, which
+# is not visible from this spot -- presumably the most recently loaded
+# table entry, with $lastK being its low 12 bits for the final one; confirm.
+# NOTE(review): bnel is a "branch likely" instruction, a form the MIPS
+# architecture manuals mark as deprecated; a plain bne looks usable if
+# rewinding $Ktbl on the final pass is harmless -- confirm before changing.
+$code.=<<___;
+       and     @X[6],0xfff
+       li      @X[7],$lastK
+       .set    noreorder
+       bne     @X[6],@X[7],.L16_xx
+       $PTR_ADD $Ktbl,16*$SZ           # Ktbl+=16
+
+       $REG_L  @X[15],16*$SZ($sp)      # restore pointer to the end of input
+       $LD     @X[0],0*$SZ($ctx)
+       $LD     @X[1],1*$SZ($ctx)
+       $LD     @X[2],2*$SZ($ctx)
+       $PTR_ADD $inp,16*$SZ
+       $LD     @X[3],3*$SZ($ctx)
+       $ADDU   $A,@X[0]
+       $LD     @X[4],4*$SZ($ctx)
+       $ADDU   $B,@X[1]
+       $LD     @X[5],5*$SZ($ctx)
+       $ADDU   $C,@X[2]
+       $LD     @X[6],6*$SZ($ctx)
+       $ADDU   $D,@X[3]
+       $LD     @X[7],7*$SZ($ctx)
+       $ADDU   $E,@X[4]
+       $ST     $A,0*$SZ($ctx)
+       $ADDU   $F,@X[5]
+       $ST     $B,1*$SZ($ctx)
+       $ADDU   $G,@X[6]
+       $ST     $C,2*$SZ($ctx)
+       $ADDU   $H,@X[7]
+       $ST     $D,3*$SZ($ctx)
+       $ST     $E,4*$SZ($ctx)
+       $ST     $F,5*$SZ($ctx)
+       $ST     $G,6*$SZ($ctx)
+       $ST     $H,7*$SZ($ctx)
+
+       bnel    $inp,@X[15],.Loop
+       $PTR_SUB $Ktbl,`($rounds-16)*$SZ`       # rewind $Ktbl
+
+       $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
+       $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
+       $REG_L  $s11,$FRAMESIZE-3*$SZREG($sp)
+       $REG_L  $s10,$FRAMESIZE-4*$SZREG($sp)
+       $REG_L  $s9,$FRAMESIZE-5*$SZREG($sp)
+       $REG_L  $s8,$FRAMESIZE-6*$SZREG($sp)
+       $REG_L  $s7,$FRAMESIZE-7*$SZREG($sp)
+       $REG_L  $s6,$FRAMESIZE-8*$SZREG($sp)
+       $REG_L  $s5,$FRAMESIZE-9*$SZREG($sp)
+       $REG_L  $s4,$FRAMESIZE-10*$SZREG($sp)
+___
+# Appended only when $flavour matches /nubi/i: additionally reload
+# $s3..$s0 and $gp from frame slots 11..15 counted down from $FRAMESIZE.
+$code.=<<___ if ($flavour =~ /nubi/i);
+       $REG_L  $s3,$FRAMESIZE-11*$SZREG($sp)
+       $REG_L  $s2,$FRAMESIZE-12*$SZREG($sp)
+       $REG_L  $s1,$FRAMESIZE-13*$SZREG($sp)
+       $REG_L  $s0,$FRAMESIZE-14*$SZREG($sp)
+       $REG_L  $gp,$FRAMESIZE-15*$SZREG($sp)
+___
+# Finish the function and open the constant table:
+#  - "jr $ra" returns; the instruction written after it releases the stack
+#    frame by adding $FRAMESIZE to $sp (the delay-slot position, assuming
+#    the earlier ".set noreorder" is still in effect -- confirm);
+#  - ".end" closes sha${label}_block_data_order;
+#  - switch to .rdata and define the K${label} label with ".align 5"; the
+#    table contents are appended by the code that follows.
+$code.=<<___;
+       jr      $ra
+       $PTR_ADD $sp,$FRAMESIZE
+.end   sha${label}_block_data_order
+
+.rdata
+.align 5
+K${label}:
+___
+# Round-constant table placed under the K${label} label. The element size
+# $SZ selects the variant: with $SZ==4 the table is 64 entries emitted as
+# .word (the SHA-256 round constants), otherwise it is 80 entries emitted
+# as .dword (the SHA-512 round constants). The values are fixed by the
+# hash specification and must not be edited.
+if ($SZ==4) {
+# 16 rows x 4 .word values = 64 constants.
+$code.=<<___;
+       .word   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+# 40 rows x 2 .dword values = 80 constants.
+$code.=<<___;
+       .dword  0x428a2f98d728ae22, 0x7137449123ef65cd
+       .dword  0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+       .dword  0x3956c25bf348b538, 0x59f111f1b605d019
+       .dword  0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+       .dword  0xd807aa98a3030242, 0x12835b0145706fbe
+       .dword  0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+       .dword  0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+       .dword  0x9bdc06a725c71235, 0xc19bf174cf692694
+       .dword  0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+       .dword  0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+       .dword  0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+       .dword  0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+       .dword  0x983e5152ee66dfab, 0xa831c66d2db43210
+       .dword  0xb00327c898fb213f, 0xbf597fc7beef0ee4
+       .dword  0xc6e00bf33da88fc2, 0xd5a79147930aa725
+       .dword  0x06ca6351e003826f, 0x142929670a0e6e70
+       .dword  0x27b70a8546d22ffc, 0x2e1b21385c26c926
+       .dword  0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+       .dword  0x650a73548baf63de, 0x766a0abb3c77b2a8
+       .dword  0x81c2c92e47edaee6, 0x92722c851482353b
+       .dword  0xa2bfe8a14cf10364, 0xa81a664bbc423001
+       .dword  0xc24b8b70d0f89791, 0xc76c51a30654be30
+       .dword  0xd192e819d6ef5218, 0xd69906245565a910
+       .dword  0xf40e35855771202a, 0x106aa07032bbd1b8
+       .dword  0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+       .dword  0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+       .dword  0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+       .dword  0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+       .dword  0x748f82ee5defb2fc, 0x78a5636f43172f60
+       .dword  0x84c87814a1f0ab72, 0x8cc702081a6439ec
+       .dword  0x90befffa23631e28, 0xa4506cebde82bde9
+       .dword  0xbef9a3f7b2c67915, 0xc67178f2e372532b
+       .dword  0xca273eceea26619c, 0xd186b8c721c0c207
+       .dword  0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+       .dword  0x06f067aa72176fba, 0x0a637dc5a2c898a6
+       .dword  0x113f9804bef90dae, 0x1b710b35131c471b
+       .dword  0x28db77f523047d84, 0x32caab7b40c72493
+       .dword  0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+       .dword  0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+       .dword  0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+___
+}
+# Append a NUL-terminated identification string after the table, followed
+# by a final ".align 5". The "@" is backslash-escaped so that Perl does not
+# try to interpolate "@openssl" as an array inside the heredoc.
+$code.=<<___;
+.asciiz        "SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+.align 5
+
+___
+
+# Post-process and emit the generated assembly. Every `...` span in $code
+# is replaced by the result of evaluating its contents as a Perl expression
+# (this is how arithmetic such as offsets is folded into literal numbers),
+# then the text is written to STDOUT.
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+# Buffered write errors (full disk, closed pipe) are only reported when the
+# handle is closed; abort with a non-zero status instead of silently leaving
+# a truncated assembly file behind for the build to consume.
+close STDOUT or die "error closing STDOUT: $!";