sha/asm/keccak1600-avx512.pl: absorb bug-fix and minor optimization.

[oweals/openssl.git] / crypto / sha / asm / sha512-mips.pl
diff --git a/crypto/sha/asm/sha512-mips.pl b/crypto/sha/asm/sha512-mips.pl

index 2c70663f98ab292bd8ed4d1d43e652a02b8d6fd3..e6fd2687f887c1c7d5d340860002ce4f3aa4398a 100644 (file)
--- a/crypto/sha/asm/sha512-mips.pl
+++ b/crypto/sha/asm/sha512-mips.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  # project. The module is, however, dual licensed under OpenSSL and
  # CRYPTOGAMS licenses depending on where you obtain it. For further
  # details see http://www.openssl.org/~appro/cryptogams/.
@@ -17,6 +24,10 @@
  # ~17%, but it comes for free, because it's same instruction sequence.
  # Improvement coefficients are for aligned input.
  
+# September 2012.
+#
+# Add MIPS[32|64]R2 code (>25% less instructions).
+
  ######################################################################
  # There is a number of MIPS ABI in use, O32 and N32/64 are most
  # widely used. Then there is a new contender: NUBI. It appears that if
@@ -31,7 +42,8 @@
  # The return value is placed in $a0. Following coding rules facilitate
  # interoperability:
  #
-# - never ever touch $tp, "thread pointer", former $gp;
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+#   excluded from the rule, because it's specified volatile];
  # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  #   old code];
  # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
@@ -44,32 +56,34 @@
  # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
  
  if ($flavour =~ /64|n32/i) {
-       $PTR_ADD="dadd";        # incidentally works even on n32
-       $PTR_SUB="dsub";        # incidentally works even on n32
+       $PTR_LA="dla";
+       $PTR_ADD="daddu";       # incidentally works even on n32
+       $PTR_SUB="dsubu";       # incidentally works even on n32
         $REG_S="sd";
         $REG_L="ld";
         $PTR_SLL="dsll";        # incidentally works even on n32
         $SZREG=8;
  } else {
-       $PTR_ADD="add";
-       $PTR_SUB="sub";
+       $PTR_LA="la";
+       $PTR_ADD="addu";
+       $PTR_SUB="subu";
         $REG_S="sw";
         $REG_L="lw";
         $PTR_SLL="sll";
         $SZREG=4;
  }
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
  #
  # <appro@openssl.org>
  #
  ######################################################################
  
-$output=shift;
-for (@ARGV) {  $big_endian=1 if (/\-DB_ENDIAN/);
-               $big_endian=0 if (/\-DL_ENDIAN/);
-               $output=$_ if (/^\w[\w\-]*\.\w+$/);     }
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
+
+for (@ARGV) {  $output=$_ if (/\w[\w\-]*\.\w+$/);      }
  open STDOUT,">$output";
  
  if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
@@ -82,6 +96,7 @@ if ($output =~ /512/) {
         $SLL="dsll";            # shift left logical
         $SRL="dsrl";            # shift right logical
         $ADDU="daddu";
+       $ROTR="drotr";
         @Sigma0=(28,34,39);
         @Sigma1=(14,18,41);
         @sigma0=( 7, 1, 8);     # right shift first
@@ -96,6 +111,7 @@ if ($output =~ /512/) {
         $SLL="sll";             # shift left logical
         $SRL="srl";             # shift right logical
         $ADDU="addu";
+       $ROTR="rotr";
         @Sigma0=( 2,13,22);
         @Sigma1=( 6,11,25);
         @sigma0=( 3, 7,18);     # right shift first
@@ -104,6 +120,9 @@ if ($output =~ /512/) {
         $rounds=64;
  }
  
+$MSB = $big_endian ? 0 : ($SZ-1);
+$LSB = ($SZ-1)&~$MSB;
+
  @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
  @X=map("\$$_",(8..23));
  
@@ -116,21 +135,77 @@ my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
  
  $code.=<<___ if ($i<15);
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+       ${LD}   @X[1],`($i+1)*$SZ`($inp)
+#else
         ${LD}l  @X[1],`($i+1)*$SZ+$MSB`($inp)
         ${LD}r  @X[1],`($i+1)*$SZ+$LSB`($inp)
+#endif
  ___
-$code.=<<___   if (!$big_endian && $i<16);     # XXX no 64-bit byte swap yet
-       srl     $tmp0,@X[0],24  # byte swap($i)
+$code.=<<___   if (!$big_endian && $i<16 && $SZ==4);
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+       wsbh    @X[0],@X[0]             # byte swap($i)
+       rotr    @X[0],@X[0],16
+#else
+       srl     $tmp0,@X[0],24          # byte swap($i)
         srl     $tmp1,@X[0],8
         andi    $tmp2,@X[0],0xFF00
         sll     @X[0],@X[0],24
         andi    $tmp1,0xFF00
         sll     $tmp2,$tmp2,8
         or      @X[0],$tmp0
-       or      $tmp1,$t2
+       or      $tmp1,$tmp2
+       or      @X[0],$tmp1
+#endif
+___
+$code.=<<___   if (!$big_endian && $i<16 && $SZ==8);
+#if defined(_MIPS_ARCH_MIPS64R2)
+       dsbh    @X[0],@X[0]             # byte swap($i)
+       dshd    @X[0],@X[0]
+#else
+       ori     $tmp0,$zero,0xFF
+       dsll    $tmp2,$tmp0,32
+       or      $tmp0,$tmp2             # 0x000000FF000000FF
+       and     $tmp1,@X[0],$tmp0       # byte swap($i)
+       dsrl    $tmp2,@X[0],24
+       dsll    $tmp1,24
+       and     $tmp2,$tmp0
+       dsll    $tmp0,8                 # 0x0000FF000000FF00
+       or      $tmp1,$tmp2
+       and     $tmp2,@X[0],$tmp0
+       dsrl    @X[0],8
+       dsll    $tmp2,8
+       and     @X[0],$tmp0
+       or      $tmp1,$tmp2
+       or      @X[0],$tmp1
+       dsrl    $tmp1,@X[0],32
+       dsll    @X[0],32
         or      @X[0],$tmp1
+#endif
  ___
  $code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+       xor     $tmp2,$f,$g                     # $i
+       $ROTR   $tmp0,$e,@Sigma1[0]
+       $ADDU   $T1,$X[0],$h
+       $ROTR   $tmp1,$e,@Sigma1[1]
+       and     $tmp2,$e
+       $ROTR   $h,$e,@Sigma1[2]
+       xor     $tmp0,$tmp1
+       $ROTR   $tmp1,$a,@Sigma0[0]
+       xor     $tmp2,$g                        # Ch(e,f,g)
+       xor     $tmp0,$h                        # Sigma1(e)
+
+       $ROTR   $h,$a,@Sigma0[1]
+       $ADDU   $T1,$tmp2
+       $LD     $tmp2,`$i*$SZ`($Ktbl)           # K[$i]
+       xor     $h,$tmp1
+       $ROTR   $tmp1,$a,@Sigma0[2]
+       $ADDU   $T1,$tmp0
+       and     $tmp0,$b,$c
+       xor     $h,$tmp1                        # Sigma0(a)
+       xor     $tmp1,$b,$c
+#else
         $ADDU   $T1,$X[0],$h                    # $i
         $SRL    $h,$e,@Sigma1[0]
         xor     $tmp2,$f,$g
@@ -160,21 +235,20 @@ $code.=<<___;
         xor     $h,$tmp1
         $SLL    $tmp1,$a,`$SZ*8-@Sigma0[0]`
         xor     $h,$tmp0
-       $ST     @X[0],`($i%16)*$SZ`($sp)
+       and     $tmp0,$b,$c
         xor     $h,$tmp1                        # Sigma0(a)
-
-       or      $tmp0,$a,$b
-       and     $tmp1,$a,$b
-       and     $tmp0,$c
-       or      $tmp1,$tmp0                     # Maj(a,b,c)
+       xor     $tmp1,$b,$c
+#endif
+       $ST     @X[0],`($i%16)*$SZ`($sp)        # offload to ring buffer
+       $ADDU   $h,$tmp0
+       and     $tmp1,$a
         $ADDU   $T1,$tmp2                       # +=K[$i]
-       $ADDU   $h,$tmp1
-
+       $ADDU   $h,$tmp1                        # +=Maj(a,b,c)
         $ADDU   $d,$T1
         $ADDU   $h,$T1
  ___
  $code.=<<___ if ($i>=13);
-       $LD     @X[3],`(($i+3)%16)*$SZ`($sp)    # prefetch
+       $LD     @X[3],`(($i+3)%16)*$SZ`($sp)    # prefetch from ring buffer
  ___
  }
  
@@ -183,6 +257,20 @@ my $i=@_[0];
  my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
  
  $code.=<<___;
+#if defined(_MIPS_ARCH_MIPS32R2) || defined(_MIPS_ARCH_MIPS64R2)
+       $SRL    $tmp2,@X[1],@sigma0[0]          # Xupdate($i)
+       $ROTR   $tmp0,@X[1],@sigma0[1]
+       $ADDU   @X[0],@X[9]                     # +=X[i+9]
+       xor     $tmp2,$tmp0
+       $ROTR   $tmp0,@X[1],@sigma0[2]
+
+       $SRL    $tmp3,@X[14],@sigma1[0]
+       $ROTR   $tmp1,@X[14],@sigma1[1]
+       xor     $tmp2,$tmp0                     # sigma0(X[i+1])
+       $ROTR   $tmp0,@X[14],@sigma1[2]
+       xor     $tmp3,$tmp1
+       $ADDU   @X[0],$tmp2
+#else
         $SRL    $tmp2,@X[1],@sigma0[0]          # Xupdate($i)
         $ADDU   @X[0],@X[9]                     # +=X[i+9]
         $SLL    $tmp1,@X[1],`$SZ*8-@sigma0[2]`
@@ -203,24 +291,24 @@ $code.=<<___;
         xor     $tmp3,$tmp0
         $SRL    $tmp0,@X[14],@sigma1[2]
         xor     $tmp3,$tmp1
-
+#endif
         xor     $tmp3,$tmp0                     # sigma1(X[i+14])
         $ADDU   @X[0],$tmp3
-
  ___
         &BODY_00_15(@_);
  }
  
  $FRAMESIZE=16*$SZ+16*$SZREG;
-$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
-$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
-$MSB = 0;
-$LSB = ($SZ-1)&~$MSB;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
  
  $code.=<<___;
+#include "mips_arch.h"
+
  .text
  .set   noat
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
  .option        pic2
+#endif
  
  .align 5
  .globl sha${label}_block_data_order
@@ -262,7 +350,7 @@ $code.=<<___ if ($flavour !~ /o32/i);       # non-o32 PIC-ification
  ___
  $code.=<<___;
         .set    reorder
-       la      $Ktbl,K${label}         # PIC-ified 'load address'
+       $PTR_LA $Ktbl,K${label}         # PIC-ified 'load address'
  
         $LD     $A,0*$SZ($ctx)          # load context
         $LD     $B,1*$SZ($ctx)
@@ -279,8 +367,12 @@ $code.=<<___;
  
  .align 5
  .Loop:
+#if defined(_MIPS_ARCH_MIPS32R6) || defined(_MIPS_ARCH_MIPS64R6)
+       ${LD}   @X[0],($inp)
+#else
         ${LD}l  @X[0],$MSB($inp)
         ${LD}r  @X[0],$LSB($inp)
+#endif
  ___
  for ($i=0;$i<16;$i++)
  { &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
@@ -294,12 +386,15 @@ for (;$i<32;$i++)
  $code.=<<___;
         and     @X[6],0xfff
         li      @X[7],$lastK
-       $PTR_ADD $Ktbl,16*$SZ           # Ktbl+=16
+       .set    noreorder
         bne     @X[6],@X[7],.L16_xx
+       $PTR_ADD $Ktbl,16*$SZ           # Ktbl+=16
  
+       $REG_L  @X[15],16*$SZ($sp)      # restore pointer to the end of input
         $LD     @X[0],0*$SZ($ctx)
         $LD     @X[1],1*$SZ($ctx)
         $LD     @X[2],2*$SZ($ctx)
+       $PTR_ADD $inp,16*$SZ
         $LD     @X[3],3*$SZ($ctx)
         $ADDU   $A,@X[0]
         $LD     @X[4],4*$SZ($ctx)
@@ -317,17 +412,14 @@ $code.=<<___;
         $ST     $C,2*$SZ($ctx)
         $ADDU   $H,@X[7]
         $ST     $D,3*$SZ($ctx)
-       $PTR_ADD $inp,16*$SZ
         $ST     $E,4*$SZ($ctx)
-       $REG_L  @X[15],16*$SZ($sp)      # restore pointer to the end of input
         $ST     $F,5*$SZ($ctx)
         $ST     $G,6*$SZ($ctx)
         $ST     $H,7*$SZ($ctx)
  
-       $PTR_SUB $Ktbl,`($rounds-16)*$SZ`       # rewind $Ktbl
         bne     $inp,@X[15],.Loop
+       $PTR_SUB $Ktbl,`($rounds-16)*$SZ`       # rewind $Ktbl
  
-       .set    noreorder
         $REG_L  $ra,$FRAMESIZE-1*$SZREG($sp)
         $REG_L  $fp,$FRAMESIZE-2*$SZREG($sp)
         $REG_L  $s11,$FRAMESIZE-3*$SZREG($sp)