# has to content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.
-$flavour = shift;
+$flavour = shift || "o32";
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_mul_add_words_tail
.L_bn_mul_add_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t1,0($a0)
$LD $t2,$BNSZ($a1)
$ADDU $v0,$ta2
sltu $at,$ta3,$at
$ST $ta3,-$BNSZ($a0)
- $ADDU $v0,$at
.set noreorder
- bgtzl $ta0,.L_bn_mul_add_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_mul_add_words_loop
+ $ADDU $v0,$at
beqz $a2,.L_bn_mul_add_words_return
nop
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_mul_words_tail
.L_bn_mul_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$a3
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
$ADDU $v0,$at
sltu $ta3,$v0,$at
$ST $v0,-$BNSZ($a0)
- $ADDU $v0,$ta3,$ta2
.set noreorder
- bgtzl $ta0,.L_bn_mul_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_mul_words_loop
+ $ADDU $v0,$ta3,$ta2
beqz $a2,.L_bn_mul_words_return
nop
.set reorder
li $minus4,-4
and $ta0,$a2,$minus4
- $LD $t0,0($a1)
beqz $ta0,.L_bn_sqr_words_tail
.L_bn_sqr_words_loop:
+ $LD $t0,0($a1)
$MULTU $t0,$t0
$LD $t2,$BNSZ($a1)
$LD $ta0,2*$BNSZ($a1)
mflo $ta3
mfhi $ta2
$ST $ta3,-2*$BNSZ($a0)
- $ST $ta2,-$BNSZ($a0)
.set noreorder
- bgtzl $ta0,.L_bn_sqr_words_loop
- $LD $t0,0($a1)
+ bgtz $ta0,.L_bn_sqr_words_loop
+ $ST $ta2,-$BNSZ($a0)
beqz $a2,.L_bn_sqr_words_return
nop
.set reorder
li $minus4,-4
and $at,$a3,$minus4
- $LD $t0,0($a1)
beqz $at,.L_bn_add_words_tail
.L_bn_add_words_loop:
+ $LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
- $ADDU $v0,$t9
.set noreorder
- bgtzl $at,.L_bn_add_words_loop
- $LD $t0,0($a1)
+ bgtz $at,.L_bn_add_words_loop
+ $ADDU $v0,$t9
beqz $a3,.L_bn_add_words_return
nop
.set reorder
li $minus4,-4
and $at,$a3,$minus4
- $LD $t0,0($a1)
beqz $at,.L_bn_sub_words_tail
.L_bn_sub_words_loop:
+ $LD $t0,0($a1)
$LD $ta0,0($a2)
subu $a3,4
$LD $t1,$BNSZ($a1)
$SUBU $t3,$ta3,$v0
sgtu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
- $ADDU $v0,$t9
.set noreorder
- bgtzl $at,.L_bn_sub_words_loop
- $LD $t0,0($a1)
+ bgtz $at,.L_bn_sub_words_loop
+ $ADDU $v0,$t9
beqz $a3,.L_bn_sub_words_return
nop
$code.=<<___;
.set reorder
move $ta3,$ra
- bal bn_div_words
+ bal bn_div_words_internal
move $ra,$ta3
$MULTU $ta2,$v0
$LD $t2,-2*$BNSZ($a3)
sltu $ta0,$a1,$a2
or $t8,$ta0
.set noreorder
- beqzl $at,.L_bn_div_3_words_inner_loop
+ beqz $at,.L_bn_div_3_words_inner_loop
$SUBU $v0,1
+ $ADDU $v0,1
.set reorder
.L_bn_div_3_words_inner_loop_done:
.set noreorder
and $t2,$a0
$SRL $at,$a1,$t1
.set noreorder
- bnezl $t2,.+8
+ beqz $t2,.+12
+ nop
break 6 # signal overflow
.set reorder
$SLL $a0,$t9
$SRL $DH,$a2,4*$BNSZ # bits
sgeu $at,$a0,$a2
.set noreorder
- bnezl $at,.+8
+ beqz $at,.+12
+ nop
$SUBU $a0,$a2
.set reorder
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+sub add_c2 () {
+my ($hi,$lo,$c0,$c1,$c2,
+ $warm, # !$warm denotes first call with specific sequence of
+ # $c_[XYZ] when there is no Z-carry to accumulate yet;
+ $an,$bn # these two are arguments for multiplication which
+ # result is used in *next* step [which is why it's
+ # commented as "forward multiplication" below];
+ )=@_;
+$code.=<<___;
+ mflo $lo
+ mfhi $hi
+ $ADDU $c0,$lo
+ sltu $at,$c0,$lo
+ $MULTU $an,$bn # forward multiplication
+ $ADDU $c0,$lo
+ $ADDU $at,$hi
+ sltu $lo,$c0,$lo
+ $ADDU $c1,$at
+ $ADDU $hi,$lo
+___
+$code.=<<___ if (!$warm);
+ sltu $c2,$c1,$at
+ $ADDU $c1,$hi
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+$code.=<<___ if ($warm);
+ sltu $at,$c1,$at
+ $ADDU $c1,$hi
+ $ADDU $c2,$at
+ sltu $hi,$c1,$hi
+ $ADDU $c2,$hi
+___
+}
+
$code.=<<___;
.align 5
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
+$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3);
- $ADDU $c_2,$at
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
+$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,6*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
+$code.=<<___;
$ST $c_2,7*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,8*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
+$code.=<<___;
$ST $c_1,9*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_1,$at
- $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+ $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,10*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_2,$at
- $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+ $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
+$code.=<<___;
$ST $c_3,11*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_1,$t_1
sltu $at,$c_2,$t_2
$ADDU $c_3,$at
$ST $c_1,12*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
+$code.=<<___;
$ST $c_2,13*$BNSZ($a0)
mflo $t_1
sltu $at,$c_2,$t_1
$ADDU $c_3,$t_2,$at
$ST $c_2,$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_3,$t_1
sltu $at,$c_1,$t_2
$ADDU $c_2,$at
$ST $c_3,2*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_3,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
- mflo $t_1
- mfhi $t_2
- slt $at,$t_2,$zero
- $ADDU $c_3,$at
- $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1);
- $SLL $t_2,1
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_1,$t_1
- sltu $at,$c_1,$t_1
- $ADDU $t_2,$at
- $ADDU $c_2,$t_2
- sltu $at,$c_2,$t_2
- $ADDU $c_3,$at
+___
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+ $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3);
+ &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+ $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
+$code.=<<___;
$ST $c_1,3*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_1,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_2,$t_1
- sltu $at,$c_2,$t_1
- $ADDU $t_2,$at
- $ADDU $c_3,$t_2
- sltu $at,$c_3,$t_2
- $ADDU $c_1,$at
+___
+ &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+ $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
mflo $t_1
mfhi $t_2
$ADDU $c_2,$t_1
sltu $at,$c_3,$t_2
$ADDU $c_1,$at
$ST $c_2,4*$BNSZ($a0)
-
- mflo $t_1
- mfhi $t_2
- slt $c_2,$t_2,$zero
- $SLL $t_2,1
- $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3);
- slt $a2,$t_1,$zero
- $ADDU $t_2,$a2
- $SLL $t_1,1
- $ADDU $c_3,$t_1
- sltu $at,$c_3,$t_1
- $ADDU $t_2,$at
- $ADDU $c_1,$t_2
- sltu $at,$c_1,$t_2
- $ADDU $c_2,$at
+___
+ &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+ $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
$ST $c_3,5*$BNSZ($a0)
mflo $t_1