crypto/sha/asm/sha512-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 #
  16 # Hardware SPARC T4 support by David S. Miller
  17 # ====================================================================
  18
  19 # SHA256 performance improvement over compiler generated code varies
  20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
  21 # build]. Just like in SHA1 module I aim to ensure scalability on
  22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
  23
  24 # SHA512 on pre-T1 UltraSPARC.
  25 #
  26 # Performance is >75% better than 64-bit code generated by Sun C and
  27 # over 2x than 32-bit code. X[16] resides on stack, but access to it
  28 # is scheduled for L2 latency and staged through 32 least significant
  29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
  30 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  31 # good [optimal coefficient is 50%].
  32 #
  33 # SHA512 on UltraSPARC T1.
  34 #
  35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
  36 # because 64-bit code generator has the advantage of using 64-bit
  37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
  38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
  39 # code by 60%, not to mention that it doesn't suffer from severe decay
  40 # when running 4x as many threads as physical cores, and that it leaves
  41 # gcc [3.4] behind by a factor of over 4x! If compared to SHA256, single
  42 # thread performance is only 10% better, but overall throughput for maximum
  43 # amount of threads for given CPU exceeds corresponding one of SHA256
  44 # by 30% [again, optimal coefficient is 50%].
  45 #
  46 # (*)   Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
  47 #       in-order, i.e. a load instruction has to complete before the
  48 #       next instruction in the given thread is executed, even if the
  49 #       latter does not depend on the load result! This means that on T1
  50 #       two 32-bit loads are always slower than one 64-bit load. Once
  51 #       again this is unlike pre-T1 UltraSPARC, where, if scheduled
  52 #       appropriately, 2x32-bit loads can be as fast as 1x64-bit ones.
  53 #
  54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
  55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
  56 # saturates at 11.5x single-process result on 8-core processor, or
  57 # ~11/16GBps per 2.85GHz socket.
  58
  59 # $output is the last argument if it looks like a file (it has an extension)
  60 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  61
  62 $output and open STDOUT,">$output";
  63
  64 if ($output =~ /512/) {
  65         $label="512";
  66         $SZ=8;
  67         $LD="ldx";              # load from memory
  68         $ST="stx";              # store to memory
  69         $SLL="sllx";            # shift left logical
  70         $SRL="srlx";            # shift right logical
  71         @Sigma0=(28,34,39);
  72         @Sigma1=(14,18,41);
  73         @sigma0=( 7, 1, 8);     # right shift first
  74         @sigma1=( 6,19,61);     # right shift first
  75         $lastK=0x817;
  76         $rounds=80;
  77         $align=4;
  78
  79         $locals=16*$SZ;         # X[16]
  80
  81         $A="%o0";
  82         $B="%o1";
  83         $C="%o2";
  84         $D="%o3";
  85         $E="%o4";
  86         $F="%o5";
  87         $G="%g1";
  88         $H="%o7";
  89         @V=($A,$B,$C,$D,$E,$F,$G,$H);
  90 } else {
  91         $label="256";
  92         $SZ=4;
  93         $LD="ld";               # load from memory
  94         $ST="st";               # store to memory
  95         $SLL="sll";             # shift left logical
  96         $SRL="srl";             # shift right logical
  97         @Sigma0=( 2,13,22);
  98         @Sigma1=( 6,11,25);
  99         @sigma0=( 3, 7,18);     # right shift first
 100         @sigma1=(10,17,19);     # right shift first
 101         $lastK=0x8f2;
 102         $rounds=64;
 103         $align=8;
 104
 105         $locals=0;              # X[16] is register resident
 106         @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
 107
 108         $A="%l0";
 109         $B="%l1";
 110         $C="%l2";
 111         $D="%l3";
 112         $E="%l4";
 113         $F="%l5";
 114         $G="%l6";
 115         $H="%l7";
 116         @V=($A,$B,$C,$D,$E,$F,$G,$H);
 117 }
 118 $T1="%g2";
 119 $tmp0="%g3";
 120 $tmp1="%g4";
 121 $tmp2="%g5";
 122
 123 $ctx="%i0";
 124 $inp="%i1";
 125 $len="%i2";
 126 $Ktbl="%i3";
 127 $tmp31="%i4";
 128 $tmp32="%i5";
 129
 130 ########### SHA256
 131 $Xload = sub {
 132 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 133
 134     if ($i==0) {
 135 $code.=<<___;
 136         ldx     [$inp+0],@X[0]
 137         ldx     [$inp+16],@X[2]
 138         ldx     [$inp+32],@X[4]
 139         ldx     [$inp+48],@X[6]
 140         ldx     [$inp+8],@X[1]
 141         ldx     [$inp+24],@X[3]
 142         subcc   %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
 143         ldx     [$inp+40],@X[5]
 144         bz,pt   %icc,.Laligned
 145         ldx     [$inp+56],@X[7]
 146
 147         sllx    @X[0],$tmp31,@X[0]
 148         ldx     [$inp+64],$T1
 149 ___
 150 for($j=0;$j<7;$j++)
 151 {   $code.=<<___;
 152         srlx    @X[$j+1],$tmp32,$tmp1
 153         sllx    @X[$j+1],$tmp31,@X[$j+1]
 154         or      $tmp1,@X[$j],@X[$j]
 155 ___
 156 }
 157 $code.=<<___;
 158         srlx    $T1,$tmp32,$T1
 159         or      $T1,@X[7],@X[7]
 160 .Laligned:
 161 ___
 162     }
 163
 164     if ($i&1) {
 165         $code.="\tadd   @X[$i/2],$h,$T1\n";
 166     } else {
 167         $code.="\tsrlx  @X[$i/2],32,$T1\n\tadd  $h,$T1,$T1\n";
 168     }
 169 } if ($SZ==4);
 170
 171 ########### SHA512
 172 $Xload = sub {
 173 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 174 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
 175
 176 $code.=<<___ if ($i==0);
 177         ld      [$inp+0],%l0
 178         ld      [$inp+4],%l1
 179         ld      [$inp+8],%l2
 180         ld      [$inp+12],%l3
 181         ld      [$inp+16],%l4
 182         ld      [$inp+20],%l5
 183         ld      [$inp+24],%l6
 184         cmp     $tmp31,0
 185         ld      [$inp+28],%l7
 186 ___
 187 $code.=<<___ if ($i<15);
 188         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 189         add     $tmp31,32,$tmp0
 190         sllx    @pair[0],$tmp0,$tmp1
 191         `"ld    [$inp+".eval(32+0+$i*8)."],@pair[0]"    if ($i<12)`
 192         srlx    @pair[2],$tmp32,@pair[1]
 193         or      $tmp1,$tmp2,$tmp2
 194         or      @pair[1],$tmp2,$tmp2
 195         `"ld    [$inp+".eval(32+4+$i*8)."],@pair[1]"    if ($i<12)`
 196         add     $h,$tmp2,$T1
 197         $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
 198 ___
 199 $code.=<<___ if ($i==12);
 200         bnz,a,pn        %icc,.+8
 201         ld      [$inp+128],%l0
 202 ___
 203 $code.=<<___ if ($i==15);
 204         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
 205         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 206         add     $tmp31,32,$tmp0
 207         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
 208         sllx    @pair[0],$tmp0,$tmp1
 209         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
 210         srlx    @pair[2],$tmp32,@pair[1]
 211         or      $tmp1,$tmp2,$tmp2
 212         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
 213         or      @pair[1],$tmp2,$tmp2
 214         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
 215         add     $h,$tmp2,$T1
 216         $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
 217         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
 218         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
 219         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
 220 ___
 221 } if ($SZ==8);
 222
 223 ########### common
 224 sub BODY_00_15 {
 225 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 226
 227     if ($i<16) {
 228         &$Xload(@_);
 229     } else {
 230         $code.="\tadd   $h,$T1,$T1\n";
 231     }
 232
 233 $code.=<<___;
 234         $SRL    $e,@Sigma1[0],$h        !! $i
 235         xor     $f,$g,$tmp2
 236         $SLL    $e,`$SZ*8-@Sigma1[2]`,$tmp1
 237         and     $e,$tmp2,$tmp2
 238         $SRL    $e,@Sigma1[1],$tmp0
 239         xor     $tmp1,$h,$h
 240         $SLL    $e,`$SZ*8-@Sigma1[1]`,$tmp1
 241         xor     $tmp0,$h,$h
 242         $SRL    $e,@Sigma1[2],$tmp0
 243         xor     $tmp1,$h,$h
 244         $SLL    $e,`$SZ*8-@Sigma1[0]`,$tmp1
 245         xor     $tmp0,$h,$h
 246         xor     $g,$tmp2,$tmp2          ! Ch(e,f,g)
 247         xor     $tmp1,$h,$tmp0          ! Sigma1(e)
 248
 249         $SRL    $a,@Sigma0[0],$h
 250         add     $tmp2,$T1,$T1
 251         $LD     [$Ktbl+`$i*$SZ`],$tmp2  ! K[$i]
 252         $SLL    $a,`$SZ*8-@Sigma0[2]`,$tmp1
 253         add     $tmp0,$T1,$T1
 254         $SRL    $a,@Sigma0[1],$tmp0
 255         xor     $tmp1,$h,$h
 256         $SLL    $a,`$SZ*8-@Sigma0[1]`,$tmp1
 257         xor     $tmp0,$h,$h
 258         $SRL    $a,@Sigma0[2],$tmp0
 259         xor     $tmp1,$h,$h
 260         $SLL    $a,`$SZ*8-@Sigma0[0]`,$tmp1
 261         xor     $tmp0,$h,$h
 262         xor     $tmp1,$h,$h             ! Sigma0(a)
 263
 264         or      $a,$b,$tmp0
 265         and     $a,$b,$tmp1
 266         and     $c,$tmp0,$tmp0
 267         or      $tmp0,$tmp1,$tmp1       ! Maj(a,b,c)
 268         add     $tmp2,$T1,$T1           ! +=K[$i]
 269         add     $tmp1,$h,$h
 270
 271         add     $T1,$d,$d
 272         add     $T1,$h,$h
 273 ___
 274 }
 275
 276 ########### SHA256
 277 $BODY_16_XX = sub {
 278 my $i=@_[0];
 279 my $xi;
 280
 281     if ($i&1) {
 282         $xi=$tmp32;
 283         $code.="\tsrlx  @X[(($i+1)/2)%8],32,$xi\n";
 284     } else {
 285         $xi=@X[(($i+1)/2)%8];
 286     }
 287 $code.=<<___;
 288         srl     $xi,@sigma0[0],$T1              !! Xupdate($i)
 289         sll     $xi,`32-@sigma0[2]`,$tmp1
 290         srl     $xi,@sigma0[1],$tmp0
 291         xor     $tmp1,$T1,$T1
 292         sll     $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 293         xor     $tmp0,$T1,$T1
 294         srl     $xi,@sigma0[2],$tmp0
 295         xor     $tmp1,$T1,$T1
 296 ___
 297     if ($i&1) {
 298         $xi=@X[(($i+14)/2)%8];
 299     } else {
 300         $xi=$tmp32;
 301         $code.="\tsrlx  @X[(($i+14)/2)%8],32,$xi\n";
 302     }
 303 $code.=<<___;
 304         srl     $xi,@sigma1[0],$tmp2
 305         xor     $tmp0,$T1,$T1                   ! T1=sigma0(X[i+1])
 306         sll     $xi,`32-@sigma1[2]`,$tmp1
 307         srl     $xi,@sigma1[1],$tmp0
 308         xor     $tmp1,$tmp2,$tmp2
 309         sll     $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
 310         xor     $tmp0,$tmp2,$tmp2
 311         srl     $xi,@sigma1[2],$tmp0
 312         xor     $tmp1,$tmp2,$tmp2
 313 ___
 314     if ($i&1) {
 315         $xi=@X[($i/2)%8];
 316 $code.=<<___;
 317         srlx    @X[(($i+9)/2)%8],32,$tmp1       ! X[i+9]
 318         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 319         srl     @X[($i/2)%8],0,$tmp0
 320         add     $tmp2,$tmp1,$tmp1
 321         add     $xi,$T1,$T1                     ! +=X[i]
 322         xor     $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 323         add     $tmp1,$T1,$T1
 324
 325         srl     $T1,0,$T1
 326         or      $T1,@X[($i/2)%8],@X[($i/2)%8]
 327 ___
 328     } else {
 329         $xi=@X[(($i+9)/2)%8];
 330 $code.=<<___;
 331         srlx    @X[($i/2)%8],32,$tmp1           ! X[i]
 332         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 333         add     $xi,$T1,$T1                     ! +=X[i+9]
 334         add     $tmp2,$tmp1,$tmp1
 335         srl     @X[($i/2)%8],0,@X[($i/2)%8]
 336         add     $tmp1,$T1,$T1
 337
 338         sllx    $T1,32,$tmp0
 339         or      $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 340 ___
 341     }
 342     &BODY_00_15(@_);
 343 } if ($SZ==4);
 344
 345 ########### SHA512
 346 $BODY_16_XX = sub {
 347 my $i=@_[0];
 348 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
 349
 350 $code.=<<___;
 351         sllx    %l2,32,$tmp0            !! Xupdate($i)
 352         or      %l3,$tmp0,$tmp0
 353
 354         srlx    $tmp0,@sigma0[0],$T1
 355         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
 356         sllx    $tmp0,`64-@sigma0[2]`,$tmp1
 357         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
 358         srlx    $tmp0,@sigma0[1],$tmp0
 359         xor     $tmp1,$T1,$T1
 360         sllx    $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 361         xor     $tmp0,$T1,$T1
 362         srlx    $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
 363         xor     $tmp1,$T1,$T1
 364         sllx    %l6,32,$tmp2
 365         xor     $tmp0,$T1,$T1           ! sigma0(X[$i+1])
 366         or      %l7,$tmp2,$tmp2
 367
 368         srlx    $tmp2,@sigma1[0],$tmp1
 369         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
 370         sllx    $tmp2,`64-@sigma1[2]`,$tmp0
 371         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
 372         srlx    $tmp2,@sigma1[1],$tmp2
 373         xor     $tmp0,$tmp1,$tmp1
 374         sllx    $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
 375         xor     $tmp2,$tmp1,$tmp1
 376         srlx    $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
 377         xor     $tmp0,$tmp1,$tmp1
 378         sllx    %l4,32,$tmp0
 379         xor     $tmp2,$tmp1,$tmp1       ! sigma1(X[$i+14])
 380         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
 381         or      %l5,$tmp0,$tmp0
 382         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
 383
 384         sllx    %l0,32,$tmp2
 385         add     $tmp1,$T1,$T1
 386         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
 387         or      %l1,$tmp2,$tmp2
 388         add     $tmp0,$T1,$T1           ! +=X[$i+9]
 389         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
 390         add     $tmp2,$T1,$T1           ! +=X[$i]
 391         $ST     $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
 392 ___
 393     &BODY_00_15(@_);
 394 } if ($SZ==8);
 395
 396 $code.=<<___;
 397 #include "sparc_arch.h"
 398
 399 #ifdef __arch64__
 400 .register       %g2,#scratch
 401 .register       %g3,#scratch
 402 #endif
 403
 404 .section        ".text",#alloc,#execinstr
 405
 406 .align  64
 407 K${label}:
 408 .type   K${label},#object
 409 ___
 410 if ($SZ==4) {
 411 $code.=<<___;
 412         .long   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
 413         .long   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
 414         .long   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
 415         .long   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
 416         .long   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
 417         .long   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
 418         .long   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
 419         .long   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
 420         .long   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
 421         .long   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
 422         .long   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
 423         .long   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
 424         .long   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
 425         .long   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
 426         .long   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
 427         .long   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 428 ___
 429 } else {
 430 $code.=<<___;
 431         .long   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
 432         .long   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
 433         .long   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
 434         .long   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
 435         .long   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
 436         .long   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
 437         .long   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
 438         .long   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
 439         .long   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
 440         .long   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
 441         .long   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
 442         .long   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
 443         .long   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
 444         .long   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
 445         .long   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
 446         .long   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
 447         .long   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
 448         .long   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
 449         .long   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
 450         .long   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
 451         .long   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
 452         .long   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
 453         .long   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
 454         .long   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
 455         .long   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
 456         .long   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
 457         .long   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
 458         .long   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
 459         .long   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
 460         .long   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
 461         .long   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
 462         .long   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
 463         .long   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
 464         .long   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
 465         .long   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
 466         .long   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
 467         .long   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
 468         .long   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
 469         .long   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
 470         .long   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
 471 ___
 472 }
 473 $code.=<<___;
 474 .size   K${label},.-K${label}
 475
 476 #ifdef __PIC__
 477 SPARC_PIC_THUNK(%g1)
 478 #endif
 479
 480 .globl  sha${label}_block_data_order
 481 .align  32
 482 sha${label}_block_data_order:
 483         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
 484         ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
 485
 486         andcc   %g1, CFR_SHA${label}, %g0
 487         be      .Lsoftware
 488         nop
 489 ___
 490 $code.=<<___ if ($SZ==8);               # SHA512
 491         ldd     [%o0 + 0x00], %f0       ! load context
 492         ldd     [%o0 + 0x08], %f2
 493         ldd     [%o0 + 0x10], %f4
 494         ldd     [%o0 + 0x18], %f6
 495         ldd     [%o0 + 0x20], %f8
 496         ldd     [%o0 + 0x28], %f10
 497         andcc   %o1, 0x7, %g0
 498         ldd     [%o0 + 0x30], %f12
 499         bne,pn  %icc, .Lhwunaligned
 500          ldd    [%o0 + 0x38], %f14
 501
 502 .Lhwaligned_loop:
 503         ldd     [%o1 + 0x00], %f16
 504         ldd     [%o1 + 0x08], %f18
 505         ldd     [%o1 + 0x10], %f20
 506         ldd     [%o1 + 0x18], %f22
 507         ldd     [%o1 + 0x20], %f24
 508         ldd     [%o1 + 0x28], %f26
 509         ldd     [%o1 + 0x30], %f28
 510         ldd     [%o1 + 0x38], %f30
 511         ldd     [%o1 + 0x40], %f32
 512         ldd     [%o1 + 0x48], %f34
 513         ldd     [%o1 + 0x50], %f36
 514         ldd     [%o1 + 0x58], %f38
 515         ldd     [%o1 + 0x60], %f40
 516         ldd     [%o1 + 0x68], %f42
 517         ldd     [%o1 + 0x70], %f44
 518         subcc   %o2, 1, %o2             ! done yet?
 519         ldd     [%o1 + 0x78], %f46
 520         add     %o1, 0x80, %o1
 521         prefetch [%o1 + 63], 20
 522         prefetch [%o1 + 64+63], 20
 523
 524         .word   0x81b02860              ! SHA512
 525
 526         bne,pt  SIZE_T_CC, .Lhwaligned_loop
 527         nop
 528
 529 .Lhwfinish:
 530         std     %f0, [%o0 + 0x00]       ! store context
 531         std     %f2, [%o0 + 0x08]
 532         std     %f4, [%o0 + 0x10]
 533         std     %f6, [%o0 + 0x18]
 534         std     %f8, [%o0 + 0x20]
 535         std     %f10, [%o0 + 0x28]
 536         std     %f12, [%o0 + 0x30]
 537         retl
 538          std    %f14, [%o0 + 0x38]
 539
 540 .align  16
 541 .Lhwunaligned:
 542         alignaddr %o1, %g0, %o1
 543
 544         ldd     [%o1 + 0x00], %f18
 545 .Lhwunaligned_loop:
 546         ldd     [%o1 + 0x08], %f20
 547         ldd     [%o1 + 0x10], %f22
 548         ldd     [%o1 + 0x18], %f24
 549         ldd     [%o1 + 0x20], %f26
 550         ldd     [%o1 + 0x28], %f28
 551         ldd     [%o1 + 0x30], %f30
 552         ldd     [%o1 + 0x38], %f32
 553         ldd     [%o1 + 0x40], %f34
 554         ldd     [%o1 + 0x48], %f36
 555         ldd     [%o1 + 0x50], %f38
 556         ldd     [%o1 + 0x58], %f40
 557         ldd     [%o1 + 0x60], %f42
 558         ldd     [%o1 + 0x68], %f44
 559         ldd     [%o1 + 0x70], %f46
 560         ldd     [%o1 + 0x78], %f48
 561         subcc   %o2, 1, %o2             ! done yet?
 562         ldd     [%o1 + 0x80], %f50
 563         add     %o1, 0x80, %o1
 564         prefetch [%o1 + 63], 20
 565         prefetch [%o1 + 64+63], 20
 566
 567         faligndata %f18, %f20, %f16
 568         faligndata %f20, %f22, %f18
 569         faligndata %f22, %f24, %f20
 570         faligndata %f24, %f26, %f22
 571         faligndata %f26, %f28, %f24
 572         faligndata %f28, %f30, %f26
 573         faligndata %f30, %f32, %f28
 574         faligndata %f32, %f34, %f30
 575         faligndata %f34, %f36, %f32
 576         faligndata %f36, %f38, %f34
 577         faligndata %f38, %f40, %f36
 578         faligndata %f40, %f42, %f38
 579         faligndata %f42, %f44, %f40
 580         faligndata %f44, %f46, %f42
 581         faligndata %f46, %f48, %f44
 582         faligndata %f48, %f50, %f46
 583
 584         .word   0x81b02860              ! SHA512
 585
 586         bne,pt  SIZE_T_CC, .Lhwunaligned_loop
 587         for     %f50, %f50, %f18        ! %f18=%f50
 588
 589         ba      .Lhwfinish
 590         nop
 591 ___
 592 $code.=<<___ if ($SZ==4);               # SHA256
 593         ld      [%o0 + 0x00], %f0
 594         ld      [%o0 + 0x04], %f1
 595         ld      [%o0 + 0x08], %f2
 596         ld      [%o0 + 0x0c], %f3
 597         ld      [%o0 + 0x10], %f4
 598         ld      [%o0 + 0x14], %f5
 599         andcc   %o1, 0x7, %g0
 600         ld      [%o0 + 0x18], %f6
 601         bne,pn  %icc, .Lhwunaligned
 602          ld     [%o0 + 0x1c], %f7
 603
 604 .Lhwloop:
 605         ldd     [%o1 + 0x00], %f8
 606         ldd     [%o1 + 0x08], %f10
 607         ldd     [%o1 + 0x10], %f12
 608         ldd     [%o1 + 0x18], %f14
 609         ldd     [%o1 + 0x20], %f16
 610         ldd     [%o1 + 0x28], %f18
 611         ldd     [%o1 + 0x30], %f20
 612         subcc   %o2, 1, %o2             ! done yet?
 613         ldd     [%o1 + 0x38], %f22
 614         add     %o1, 0x40, %o1
 615         prefetch [%o1 + 63], 20
 616
 617         .word   0x81b02840              ! SHA256
 618
 619         bne,pt  SIZE_T_CC, .Lhwloop
 620         nop
 621
 622 .Lhwfinish:
 623         st      %f0, [%o0 + 0x00]       ! store context
 624         st      %f1, [%o0 + 0x04]
 625         st      %f2, [%o0 + 0x08]
 626         st      %f3, [%o0 + 0x0c]
 627         st      %f4, [%o0 + 0x10]
 628         st      %f5, [%o0 + 0x14]
 629         st      %f6, [%o0 + 0x18]
 630         retl
 631          st     %f7, [%o0 + 0x1c]
 632
 633 .align  8
 634 .Lhwunaligned:
 635         alignaddr %o1, %g0, %o1
 636
 637         ldd     [%o1 + 0x00], %f10
 638 .Lhwunaligned_loop:
 639         ldd     [%o1 + 0x08], %f12
 640         ldd     [%o1 + 0x10], %f14
 641         ldd     [%o1 + 0x18], %f16
 642         ldd     [%o1 + 0x20], %f18
 643         ldd     [%o1 + 0x28], %f20
 644         ldd     [%o1 + 0x30], %f22
 645         ldd     [%o1 + 0x38], %f24
 646         subcc   %o2, 1, %o2             ! done yet?
 647         ldd     [%o1 + 0x40], %f26
 648         add     %o1, 0x40, %o1
 649         prefetch [%o1 + 63], 20
 650
 651         faligndata %f10, %f12, %f8
 652         faligndata %f12, %f14, %f10
 653         faligndata %f14, %f16, %f12
 654         faligndata %f16, %f18, %f14
 655         faligndata %f18, %f20, %f16
 656         faligndata %f20, %f22, %f18
 657         faligndata %f22, %f24, %f20
 658         faligndata %f24, %f26, %f22
 659
 660         .word   0x81b02840              ! SHA256
 661
 662         bne,pt  SIZE_T_CC, .Lhwunaligned_loop
 663         for     %f26, %f26, %f10        ! %f10=%f26
 664
 665         ba      .Lhwfinish
 666         nop
 667 ___
 668 $code.=<<___;
 669 .align  16
 670 .Lsoftware:
 671         save    %sp,-STACK_FRAME-$locals,%sp
 672         and     $inp,`$align-1`,$tmp31
 673         sllx    $len,`log(16*$SZ)/log(2)`,$len
 674         andn    $inp,`$align-1`,$inp
 675         sll     $tmp31,3,$tmp31
 676         add     $inp,$len,$len
 677 ___
 678 $code.=<<___ if ($SZ==8); # SHA512
 679         mov     32,$tmp32
 680         sub     $tmp32,$tmp31,$tmp32
 681 ___
 682 $code.=<<___;
 683 .Lpic:  call    .+8
 684         add     %o7,K${label}-.Lpic,$Ktbl
 685
 686         $LD     [$ctx+`0*$SZ`],$A
 687         $LD     [$ctx+`1*$SZ`],$B
 688         $LD     [$ctx+`2*$SZ`],$C
 689         $LD     [$ctx+`3*$SZ`],$D
 690         $LD     [$ctx+`4*$SZ`],$E
 691         $LD     [$ctx+`5*$SZ`],$F
 692         $LD     [$ctx+`6*$SZ`],$G
 693         $LD     [$ctx+`7*$SZ`],$H
 694
 695 .Lloop:
 696 ___
 697 for ($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
 698 $code.=".L16_xx:\n";
 699 for (;$i<32;$i++)       { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
 700 $code.=<<___;
 701         and     $tmp2,0xfff,$tmp2
 702         cmp     $tmp2,$lastK
 703         bne     .L16_xx
 704         add     $Ktbl,`16*$SZ`,$Ktbl    ! Ktbl+=16
 705
 706 ___
 707 $code.=<<___ if ($SZ==4); # SHA256
 708         $LD     [$ctx+`0*$SZ`],@X[0]
 709         $LD     [$ctx+`1*$SZ`],@X[1]
 710         $LD     [$ctx+`2*$SZ`],@X[2]
 711         $LD     [$ctx+`3*$SZ`],@X[3]
 712         $LD     [$ctx+`4*$SZ`],@X[4]
 713         $LD     [$ctx+`5*$SZ`],@X[5]
 714         $LD     [$ctx+`6*$SZ`],@X[6]
 715         $LD     [$ctx+`7*$SZ`],@X[7]
 716
 717         add     $A,@X[0],$A
 718         $ST     $A,[$ctx+`0*$SZ`]
 719         add     $B,@X[1],$B
 720         $ST     $B,[$ctx+`1*$SZ`]
 721         add     $C,@X[2],$C
 722         $ST     $C,[$ctx+`2*$SZ`]
 723         add     $D,@X[3],$D
 724         $ST     $D,[$ctx+`3*$SZ`]
 725         add     $E,@X[4],$E
 726         $ST     $E,[$ctx+`4*$SZ`]
 727         add     $F,@X[5],$F
 728         $ST     $F,[$ctx+`5*$SZ`]
 729         add     $G,@X[6],$G
 730         $ST     $G,[$ctx+`6*$SZ`]
 731         add     $H,@X[7],$H
 732         $ST     $H,[$ctx+`7*$SZ`]
 733 ___
 734 $code.=<<___ if ($SZ==8); # SHA512
 735         ld      [$ctx+`0*$SZ+0`],%l0
 736         ld      [$ctx+`0*$SZ+4`],%l1
 737         ld      [$ctx+`1*$SZ+0`],%l2
 738         ld      [$ctx+`1*$SZ+4`],%l3
 739         ld      [$ctx+`2*$SZ+0`],%l4
 740         ld      [$ctx+`2*$SZ+4`],%l5
 741         ld      [$ctx+`3*$SZ+0`],%l6
 742
 743         sllx    %l0,32,$tmp0
 744         ld      [$ctx+`3*$SZ+4`],%l7
 745         sllx    %l2,32,$tmp1
 746         or      %l1,$tmp0,$tmp0
 747         or      %l3,$tmp1,$tmp1
 748         add     $tmp0,$A,$A
 749         add     $tmp1,$B,$B
 750         $ST     $A,[$ctx+`0*$SZ`]
 751         sllx    %l4,32,$tmp2
 752         $ST     $B,[$ctx+`1*$SZ`]
 753         sllx    %l6,32,$T1
 754         or      %l5,$tmp2,$tmp2
 755         or      %l7,$T1,$T1
 756         add     $tmp2,$C,$C
 757         $ST     $C,[$ctx+`2*$SZ`]
 758         add     $T1,$D,$D
 759         $ST     $D,[$ctx+`3*$SZ`]
 760
 761         ld      [$ctx+`4*$SZ+0`],%l0
 762         ld      [$ctx+`4*$SZ+4`],%l1
 763         ld      [$ctx+`5*$SZ+0`],%l2
 764         ld      [$ctx+`5*$SZ+4`],%l3
 765         ld      [$ctx+`6*$SZ+0`],%l4
 766         ld      [$ctx+`6*$SZ+4`],%l5
 767         ld      [$ctx+`7*$SZ+0`],%l6
 768
 769         sllx    %l0,32,$tmp0
 770         ld      [$ctx+`7*$SZ+4`],%l7
 771         sllx    %l2,32,$tmp1
 772         or      %l1,$tmp0,$tmp0
 773         or      %l3,$tmp1,$tmp1
 774         add     $tmp0,$E,$E
 775         add     $tmp1,$F,$F
 776         $ST     $E,[$ctx+`4*$SZ`]
 777         sllx    %l4,32,$tmp2
 778         $ST     $F,[$ctx+`5*$SZ`]
 779         sllx    %l6,32,$T1
 780         or      %l5,$tmp2,$tmp2
 781         or      %l7,$T1,$T1
 782         add     $tmp2,$G,$G
 783         $ST     $G,[$ctx+`6*$SZ`]
 784         add     $T1,$H,$H
 785         $ST     $H,[$ctx+`7*$SZ`]
 786 ___
 787 $code.=<<___;
 788         add     $inp,`16*$SZ`,$inp              ! advance inp
 789         cmp     $inp,$len
 790         bne     SIZE_T_CC,.Lloop
 791         sub     $Ktbl,`($rounds-16)*$SZ`,$Ktbl  ! rewind Ktbl
 792
 793         ret
 794         restore
 795 .type   sha${label}_block_data_order,#function
 796 .size   sha${label}_block_data_order,(.-sha${label}_block_data_order)
 797 .asciz  "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 798 .align  4
 799 ___
 800
 801 # The purpose of these subroutines is to encode VIS instructions
 802 # explicitly, so that the module can be compiled without specifying VIS
 803 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 804 # The idea is to keep open the option of producing a "universal" binary
 805 # and let the programmer detect at run-time whether the CPU supports VIS.
 806 sub unvis {
 807 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 808 my $ref,$opf;
 809 my %visopf = (  "faligndata"    => 0x048,
 810                 "for"           => 0x07c        );
 811
 812     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 813
 814     if ($opf=$visopf{$mnemonic}) {
 815         foreach ($rs1,$rs2,$rd) {
 816             return $ref if (!/%f([0-9]{1,2})/);
 817             $_=$1;
 818             if ($1>=32) {
 819                 return $ref if ($1&1);
 820                 # re-encode for upper double register addressing
 821                 $_=($1|$1>>5)&31;
 822             }
 823         }
 824
 825         return  sprintf ".word\t0x%08x !%s",
 826                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 827                         $ref;
 828     } else {
 829         return $ref;
 830     }
 831 }
 832 sub unalignaddr {
 833 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 834 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 835 my $ref="$mnemonic\t$rs1,$rs2,$rd";
 836
 837     foreach ($rs1,$rs2,$rd) {
 838         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
 839         else                    { return $ref; }
 840     }
 841     return  sprintf ".word\t0x%08x !%s",
 842                     0x81b00300|$rd<<25|$rs1<<14|$rs2,
 843                     $ref;
 844 }
 845
 846 foreach (split("\n",$code)) {
 847         s/\`([^\`]*)\`/eval $1/ge;
 848
 849         s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
 850                 &unvis($1,$2,$3,$4)
 851          /ge;
 852         s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 853                 &unalignaddr($1,$2,$3,$4)
 854          /ge;
 855
 856         print $_,"\n";
 857 }
 858
 859 close STDOUT or die "error closing STDOUT: $!";