crypto/sha/asm/sha512-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 #
  16 # Hardware SPARC T4 support by David S. Miller
  17 # ====================================================================
  18
  19 # SHA256 performance improvement over compiler generated code varies
  20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
  21 # build]. Just like in SHA1 module I aim to ensure scalability on
  22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
  23
  24 # SHA512 on pre-T1 UltraSPARC.
  25 #
  26 # Performance is >75% better than 64-bit code generated by Sun C and
  27 # over 2x better than 32-bit code. X[16] resides on stack, but access to it
  28 # is scheduled for L2 latency and staged through 32 least significant
  29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
  30 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  31 # good [optimal coefficient is 50%].
  32 #
  33 # SHA512 on UltraSPARC T1.
  34 #
  35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
  36 # because 64-bit code generator has the advantage of using 64-bit
  37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
  38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
  39 # code by 60%, not to mention that it doesn't suffer from severe decay
  40 # when running 4 times physical cores threads and that it leaves gcc
  41 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
  42 # performance is only 10% better, but overall throughput for maximum
  43 # amount of threads for given CPU exceeds corresponding one of SHA256
  44 # by 30% [again, optimal coefficient is 50%].
  45 #
  46 # (*)   Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
  47 #       in-order, i.e. a load instruction has to complete before the
  48 #       next instruction in the given thread is executed, even if the
  49 #       latter does not depend on the load result! This means that on T1
  50 #       two 32-bit loads are always slower than one 64-bit load. Once again
  51 #       this is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
  52 #       2x32-bit loads can be as fast as 1x64-bit ones.
  53 #
  54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
  55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
  56 # saturates at 11.5x single-process result on 8-core processor, or
  57 # ~11/16GBps per 2.85GHz socket.
  58
  59 $output=pop;            # last command-line argument names the output file (its name also selects the flavour below)
  60 if (defined($output)) { open(STDOUT,">",$output) or die "can't open $output: $!"; }     # no argument: keep writing to the inherited STDOUT
  61
  62 if ($output =~ /512/) {     # flavour is selected by the output file name
  63         $label="512";
  64         $SZ=8;                  # bytes per word
  65         $LD="ldx";              # load from memory
  66         $ST="stx";              # store to memory
  67         $SLL="sllx";            # shift left logical
  68         $SRL="srlx";            # shift right logical
  69         @Sigma0=(28,34,39);
  70         @Sigma1=(14,18,41);
  71         @sigma0=( 7, 1, 8);     # right shift first
  72         @sigma1=( 6,19,61);     # right shift first
  73         $lastK=0x817;           # low 12 bits of the last K512 entry; compared against K[i] to end the round loop
  74         $rounds=80;
  75         $align=4;               # the input pointer is rounded down to this boundary
  76
  77         $locals=16*$SZ;         # X[16]
  78
  79         $A="%o0";               # working variables a..h
  80         $B="%o1";
  81         $C="%o2";
  82         $D="%o3";
  83         $E="%o4";
  84         $F="%o5";
  85         $G="%g1";
  86         $H="%o7";
  87         @V=($A,$B,$C,$D,$E,$F,$G,$H);   # rotated by the generator after every round
  88 } else {
  89         $label="256";
  90         $SZ=4;                  # bytes per word
  91         $LD="ld";               # load from memory
  92         $ST="st";               # store to memory
  93         $SLL="sll";             # shift left logical
  94         $SRL="srl";             # shift right logical
  95         @Sigma0=( 2,13,22);
  96         @Sigma1=( 6,11,25);
  97         @sigma0=( 3, 7,18);     # right shift first
  98         @sigma1=(10,17,19);     # right shift first
  99         $lastK=0x8f2;           # low 12 bits of the last K256 entry; compared against K[i] to end the round loop
 100         $rounds=64;
 101         $align=8;               # the input pointer is rounded down to this boundary
 102
 103         $locals=0;              # X[16] is register resident
 104         @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");   # each register packs two 32-bit X[] words
 105
 106         $A="%l0";               # working variables a..h
 107         $B="%l1";
 108         $C="%l2";
 109         $D="%l3";
 110         $E="%l4";
 111         $F="%l5";
 112         $G="%l6";
 113         $H="%l7";
 114         @V=($A,$B,$C,$D,$E,$F,$G,$H);   # rotated by the generator after every round
 115 }
 116 $T1="%g2";              # per-round accumulator T1
 117 $tmp0="%g3";            # scratch
 118 $tmp1="%g4";            # scratch
 119 $tmp2="%g5";            # scratch; holds K[i] when a round ends
 120
 121 $ctx="%i0";             # 1st argument: hash state
 122 $inp="%i1";             # 2nd argument: input pointer
 123 $len="%i2";             # 3rd argument: block count, converted to the end-of-input pointer
 124 $Ktbl="%i3";            # pointer into the K table
 125 $tmp31="%i4";           # input misalignment, in bits
 126 $tmp32="%i5";           # complementary shift count for realignment; reused as scratch by the SHA256 schedule
 127
 128 ########### SHA256
     # $Xload->($i,$a,...,$h): emit the code that fetches message word $i
     # and leaves it, added to $h, in $T1 (only the low 32 bits matter).
     # For $i==0 the emitted code first loads the whole 64-byte block into
     # @X with eight ldx instructions; when $tmp31 is non-zero (misaligned
     # input) a ninth doubleword is loaded and the registers are shifted
     # and merged.  Even-numbered words live in the upper half of a
     # register, odd-numbered ones in the lower half.
     # This definition is installed only when $SZ==4.
 129 $Xload = sub {
 130 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 131
 132     if ($i==0) {            # first round of a block: load (and realign) the input
 133 $code.=<<___;
 134         ldx     [$inp+0],@X[0]
 135         ldx     [$inp+16],@X[2]
 136         ldx     [$inp+32],@X[4]
 137         ldx     [$inp+48],@X[6]
 138         ldx     [$inp+8],@X[1]
 139         ldx     [$inp+24],@X[3]
 140         subcc   %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
 141         ldx     [$inp+40],@X[5]
 142         bz,pt   %icc,.Laligned
 143         ldx     [$inp+56],@X[7]
 144
 145         sllx    @X[0],$tmp31,@X[0]
 146         ldx     [$inp+64],$T1
 147 ___
 148 for($j=0;$j<7;$j++)     # merge each register with the leading bits of its successor
 149 {   $code.=<<___;
 150         srlx    @X[$j+1],$tmp32,$tmp1
 151         sllx    @X[$j+1],$tmp31,@X[$j+1]
 152         or      $tmp1,@X[$j],@X[$j]
 153 ___
 154 }
 155 $code.=<<___;
 156         srlx    $T1,$tmp32,$T1
 157         or      $T1,@X[7],@X[7]
 158 .Laligned:
 159 ___
 160     }
 161
 162     if ($i&1) {             # odd word: low half of the register
 163         $code.="\tadd   @X[$i/2],$h,$T1\n";
 164     } else {                # even word: high half of the register
 165         $code.="\tsrlx  @X[$i/2],32,$T1\n\tadd  $h,$T1,$T1\n";
 166     }
 167 } if ($SZ==4);
 168
 169 ########### SHA512
     # $Xload->($i,$a,...,$h): emit the code that assembles 64-bit message
     # word $i from 32-bit pieces held in %l0-%l7, stores it to X[$i] on
     # the stack and leaves $h+X[$i] in $T1.  The input is read with 32-bit
     # loads and realigned with shifts by $tmp31/$tmp32.  @pair names the
     # registers holding the high half, the low half and the 32-bit input
     # word that follows them.  For $i==15 the emitted code also reloads
     # %l0-%l7 from the stack for the first Xupdate round.
     # This definition is installed only when $SZ==8.
 170 $Xload = sub {
 171 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 172 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
 173
 174 $code.=<<___ if ($i==0);        # first round of a block: prime %l0-%l7 with the first 32 input bytes
 175         ld      [$inp+0],%l0
 176         ld      [$inp+4],%l1
 177         ld      [$inp+8],%l2
 178         ld      [$inp+12],%l3
 179         ld      [$inp+16],%l4
 180         ld      [$inp+20],%l5
 181         ld      [$inp+24],%l6
 182         cmp     $tmp31,0
 183         ld      [$inp+28],%l7
 184 ___
 185 $code.=<<___ if ($i<15);        # words 0..14; input loads are emitted only while $i<12
 186         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 187         add     $tmp31,32,$tmp0
 188         sllx    @pair[0],$tmp0,$tmp1
 189         `"ld    [$inp+".eval(32+0+$i*8)."],@pair[0]"    if ($i<12)`
 190         srlx    @pair[2],$tmp32,@pair[1]
 191         or      $tmp1,$tmp2,$tmp2
 192         or      @pair[1],$tmp2,$tmp2
 193         `"ld    [$inp+".eval(32+4+$i*8)."],@pair[1]"    if ($i<12)`
 194         add     $h,$tmp2,$T1
 195         $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
 196 ___
 197 $code.=<<___ if ($i==12);       # extra word past the block, loaded only if the earlier "cmp $tmp31,0" found misalignment
 198         bnz,a,pn        %icc,.+8
 199         ld      [$inp+128],%l0
 200 ___
 201 $code.=<<___ if ($i==15);       # last word, interleaved with reloading %l0-%l7 from X[] on the stack
 202         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
 203         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 204         add     $tmp31,32,$tmp0
 205         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
 206         sllx    @pair[0],$tmp0,$tmp1
 207         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
 208         srlx    @pair[2],$tmp32,@pair[1]
 209         or      $tmp1,$tmp2,$tmp2
 210         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
 211         or      @pair[1],$tmp2,$tmp2
 212         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
 213         add     $h,$tmp2,$T1
 214         $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
 215         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
 216         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
 217         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
 218 ___
 219 } if ($SZ==8);
 220
 221 ########### common
     # BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h): emit one round, appending to
     # $code.  For $i<16 the message word is fetched through $Xload, which
     # leaves it added to $h in $T1; for later rounds $T1 is expected to
     # hold the schedule word already and only $h is added here.  The
     # emitted code computes Sigma1(e), Ch(e,f,g), Sigma0(a) and
     # Maj(a,b,c), each rotation being built from shift pairs XORed
     # together, loads K[$i] into $tmp2, adds T1 to $d and leaves
     # T1+Sigma0(a)+Maj(a,b,c) in $h.
 222 sub BODY_00_15 {
 223 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 224
 225     if ($i<16) {
 226         &$Xload(@_);
 227     } else {
 228         $code.="\tadd   $h,$T1,$T1\n";
 229     }
 230
 231 $code.=<<___;
 232         $SRL    $e,@Sigma1[0],$h        !! $i
 233         xor     $f,$g,$tmp2
 234         $SLL    $e,`$SZ*8-@Sigma1[2]`,$tmp1
 235         and     $e,$tmp2,$tmp2
 236         $SRL    $e,@Sigma1[1],$tmp0
 237         xor     $tmp1,$h,$h
 238         $SLL    $e,`$SZ*8-@Sigma1[1]`,$tmp1
 239         xor     $tmp0,$h,$h
 240         $SRL    $e,@Sigma1[2],$tmp0
 241         xor     $tmp1,$h,$h
 242         $SLL    $e,`$SZ*8-@Sigma1[0]`,$tmp1
 243         xor     $tmp0,$h,$h
 244         xor     $g,$tmp2,$tmp2          ! Ch(e,f,g)
 245         xor     $tmp1,$h,$tmp0          ! Sigma1(e)
 246
 247         $SRL    $a,@Sigma0[0],$h
 248         add     $tmp2,$T1,$T1
 249         $LD     [$Ktbl+`$i*$SZ`],$tmp2  ! K[$i]
 250         $SLL    $a,`$SZ*8-@Sigma0[2]`,$tmp1
 251         add     $tmp0,$T1,$T1
 252         $SRL    $a,@Sigma0[1],$tmp0
 253         xor     $tmp1,$h,$h
 254         $SLL    $a,`$SZ*8-@Sigma0[1]`,$tmp1
 255         xor     $tmp0,$h,$h
 256         $SRL    $a,@Sigma0[2],$tmp0
 257         xor     $tmp1,$h,$h
 258         $SLL    $a,`$SZ*8-@Sigma0[0]`,$tmp1
 259         xor     $tmp0,$h,$h
 260         xor     $tmp1,$h,$h             ! Sigma0(a)
 261
 262         or      $a,$b,$tmp0
 263         and     $a,$b,$tmp1
 264         and     $c,$tmp0,$tmp0
 265         or      $tmp0,$tmp1,$tmp1       ! Maj(a,b,c)
 266         add     $tmp2,$T1,$T1           ! +=K[$i]
 267         add     $tmp1,$h,$h
 268
 269         add     $T1,$d,$d
 270         add     $T1,$h,$h
 271 ___
 272 }
 273
 274 ########### SHA256
     # $BODY_16_XX->($i,$a,...,$h): emit the message schedule update for
     # round $i followed by the common round body.  The emitted code
     # computes sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i] into $T1
     # and writes the result back into the half of the packed @X register
     # that held X[i].  Which operands need an "srlx ...,32" to be pulled
     # out of an upper half depends on the parity of $i; $tmp32 serves as
     # the scratch register for that.
     # This definition is installed only when $SZ==4.
 275 $BODY_16_XX = sub {
 276 my $i=@_[0];
 277 my $xi;
 278
 279     if ($i&1) {             # X[i+1] sits in an upper half
 280         $xi=$tmp32;
 281         $code.="\tsrlx  @X[(($i+1)/2)%8],32,$xi\n";
 282     } else {                # X[i+1] sits in a lower half
 283         $xi=@X[(($i+1)/2)%8];
 284     }
 285 $code.=<<___;
 286         srl     $xi,@sigma0[0],$T1              !! Xupdate($i)
 287         sll     $xi,`32-@sigma0[2]`,$tmp1
 288         srl     $xi,@sigma0[1],$tmp0
 289         xor     $tmp1,$T1,$T1
 290         sll     $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 291         xor     $tmp0,$T1,$T1
 292         srl     $xi,@sigma0[2],$tmp0
 293         xor     $tmp1,$T1,$T1
 294 ___
 295     if ($i&1) {             # X[i+14] sits in a lower half
 296         $xi=@X[(($i+14)/2)%8];
 297     } else {                # X[i+14] sits in an upper half
 298         $xi=$tmp32;
 299         $code.="\tsrlx  @X[(($i+14)/2)%8],32,$xi\n";
 300     }
 301 $code.=<<___;
 302         srl     $xi,@sigma1[0],$tmp2
 303         xor     $tmp0,$T1,$T1                   ! T1=sigma0(X[i+1])
 304         sll     $xi,`32-@sigma1[2]`,$tmp1
 305         srl     $xi,@sigma1[1],$tmp0
 306         xor     $tmp1,$tmp2,$tmp2
 307         sll     $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
 308         xor     $tmp0,$tmp2,$tmp2
 309         srl     $xi,@sigma1[2],$tmp0
 310         xor     $tmp1,$tmp2,$tmp2
 311 ___
 312     if ($i&1) {             # result replaces the lower half of X[i]'s register
 313         $xi=@X[($i/2)%8];
 314 $code.=<<___;
 315         srlx    @X[(($i+9)/2)%8],32,$tmp1       ! X[i+9]
 316         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 317         srl     @X[($i/2)%8],0,$tmp0
 318         add     $tmp2,$tmp1,$tmp1
 319         add     $xi,$T1,$T1                     ! +=X[i]
 320         xor     $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 321         add     $tmp1,$T1,$T1
 322
 323         srl     $T1,0,$T1
 324         or      $T1,@X[($i/2)%8],@X[($i/2)%8]
 325 ___
 326     } else {                # result replaces the upper half of X[i]'s register
 327         $xi=@X[(($i+9)/2)%8];
 328 $code.=<<___;
 329         srlx    @X[($i/2)%8],32,$tmp1           ! X[i]
 330         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 331         add     $xi,$T1,$T1                     ! +=X[i+9]
 332         add     $tmp2,$tmp1,$tmp1
 333         srl     @X[($i/2)%8],0,@X[($i/2)%8]
 334         add     $tmp1,$T1,$T1
 335
 336         sllx    $T1,32,$tmp0
 337         or      $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 338 ___
 339     }
 340     &BODY_00_15(@_);        # continue with the common round body
 341 } if ($SZ==4);
 342
 343 ########### SHA512
     # $BODY_16_XX->($i,$a,...,$h): emit the message schedule update for
     # round $i followed by the common round body.  The emitted code
     # expects the 32-bit halves of X[i], X[i+1], X[i+9] and X[i+14] in
     # %l0/%l1, %l2/%l3, %l4/%l5 and %l6/%l7, joins each pair into a
     # 64-bit value, computes sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] +
     # X[i] into $T1 and stores it to slot $i%16 of X[] on the stack.  The
     # loads interleaved with the arithmetic refill %l0-%l7 for the next
     # round.
     # This definition is installed only when $SZ==8.
 344 $BODY_16_XX = sub {
 345 my $i=@_[0];
 346 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
 347
 348 $code.=<<___;
 349         sllx    %l2,32,$tmp0            !! Xupdate($i)
 350         or      %l3,$tmp0,$tmp0
 351
 352         srlx    $tmp0,@sigma0[0],$T1
 353         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
 354         sllx    $tmp0,`64-@sigma0[2]`,$tmp1
 355         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
 356         srlx    $tmp0,@sigma0[1],$tmp0
 357         xor     $tmp1,$T1,$T1
 358         sllx    $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 359         xor     $tmp0,$T1,$T1
 360         srlx    $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
 361         xor     $tmp1,$T1,$T1
 362         sllx    %l6,32,$tmp2
 363         xor     $tmp0,$T1,$T1           ! sigma0(X[$i+1])
 364         or      %l7,$tmp2,$tmp2
 365
 366         srlx    $tmp2,@sigma1[0],$tmp1
 367         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
 368         sllx    $tmp2,`64-@sigma1[2]`,$tmp0
 369         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
 370         srlx    $tmp2,@sigma1[1],$tmp2
 371         xor     $tmp0,$tmp1,$tmp1
 372         sllx    $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
 373         xor     $tmp2,$tmp1,$tmp1
 374         srlx    $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
 375         xor     $tmp0,$tmp1,$tmp1
 376         sllx    %l4,32,$tmp0
 377         xor     $tmp2,$tmp1,$tmp1       ! sigma1(X[$i+14])
 378         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
 379         or      %l5,$tmp0,$tmp0
 380         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
 381
 382         sllx    %l0,32,$tmp2
 383         add     $tmp1,$T1,$T1
 384         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
 385         or      %l1,$tmp2,$tmp2
 386         add     $tmp0,$T1,$T1           ! +=X[$i+9]
 387         ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
 388         add     $tmp2,$T1,$T1           ! +=X[$i]
 389         $ST     $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
 390 ___
 391     &BODY_00_15(@_);        # continue with the common round body
 392 } if ($SZ==8);
 393
 394 $code.=<<___;           # module prologue: include, scratch register declarations, text section and the K table label
 395 #include "sparc_arch.h"
 396
 397 #ifdef __arch64__
 398 .register       %g2,#scratch
 399 .register       %g3,#scratch
 400 #endif
 401
 402 .section        ".text",#alloc,#execinstr
 403
 404 .align  64
 405 K${label}:
 406 .type   K${label},#object
 407 ___
 408 if ($SZ==4) {           # K256: 64 round constants, one 32-bit word each
 409 $code.=<<___;
 410         .long   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
 411         .long   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
 412         .long   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
 413         .long   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
 414         .long   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
 415         .long   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
 416         .long   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
 417         .long   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
 418         .long   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
 419         .long   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
 420         .long   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
 421         .long   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
 422         .long   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
 423         .long   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
 424         .long   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
 425         .long   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 426 ___
 427 } else {                # K512: 80 round constants, each emitted as a high,low pair of 32-bit words
 428 $code.=<<___;
 429         .long   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
 430         .long   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
 431         .long   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
 432         .long   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
 433         .long   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
 434         .long   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
 435         .long   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
 436         .long   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
 437         .long   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
 438         .long   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
 439         .long   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
 440         .long   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
 441         .long   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
 442         .long   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
 443         .long   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
 444         .long   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
 445         .long   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
 446         .long   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
 447         .long   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
 448         .long   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
 449         .long   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
 450         .long   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
 451         .long   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
 452         .long   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
 453         .long   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
 454         .long   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
 455         .long   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
 456         .long   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
 457         .long   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
 458         .long   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
 459         .long   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
 460         .long   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
 461         .long   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
 462         .long   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
 463         .long   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
 464         .long   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
 465         .long   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
 466         .long   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
 467         .long   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
 468         .long   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
 469 ___
 470 }
 471 $code.=<<___;           # entry point: branch to .Lsoftware unless the CFR_SHA256/CFR_SHA512 bit is set in OPENSSL_sparcv9cap_P[1]
 472 .size   K${label},.-K${label}
 473
 474 #ifdef __PIC__
 475 SPARC_PIC_THUNK(%g1)
 476 #endif
 477
 478 .globl  sha${label}_block_data_order
 479 .align  32
 480 sha${label}_block_data_order:
 481         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
 482         ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
 483
 484         andcc   %g1, CFR_SHA${label}, %g0
 485         be      .Lsoftware
 486         nop
 487 ___
 488 $code.=<<___ if ($SZ==8);               # SHA512 hardware path: state in %f0-%f14, block in %f16-%f46, opcode emitted as a raw .word; separate loops for 8-byte-aligned and unaligned input
 489         ldd     [%o0 + 0x00], %f0       ! load context
 490         ldd     [%o0 + 0x08], %f2
 491         ldd     [%o0 + 0x10], %f4
 492         ldd     [%o0 + 0x18], %f6
 493         ldd     [%o0 + 0x20], %f8
 494         ldd     [%o0 + 0x28], %f10
 495         andcc   %o1, 0x7, %g0
 496         ldd     [%o0 + 0x30], %f12
 497         bne,pn  %icc, .Lhwunaligned
 498          ldd    [%o0 + 0x38], %f14
 499
 500 .Lhwaligned_loop:
 501         ldd     [%o1 + 0x00], %f16
 502         ldd     [%o1 + 0x08], %f18
 503         ldd     [%o1 + 0x10], %f20
 504         ldd     [%o1 + 0x18], %f22
 505         ldd     [%o1 + 0x20], %f24
 506         ldd     [%o1 + 0x28], %f26
 507         ldd     [%o1 + 0x30], %f28
 508         ldd     [%o1 + 0x38], %f30
 509         ldd     [%o1 + 0x40], %f32
 510         ldd     [%o1 + 0x48], %f34
 511         ldd     [%o1 + 0x50], %f36
 512         ldd     [%o1 + 0x58], %f38
 513         ldd     [%o1 + 0x60], %f40
 514         ldd     [%o1 + 0x68], %f42
 515         ldd     [%o1 + 0x70], %f44
 516         subcc   %o2, 1, %o2             ! done yet?
 517         ldd     [%o1 + 0x78], %f46
 518         add     %o1, 0x80, %o1
 519         prefetch [%o1 + 63], 20
 520         prefetch [%o1 + 64+63], 20
 521
 522         .word   0x81b02860              ! SHA512
 523
 524         bne,pt  SIZE_T_CC, .Lhwaligned_loop
 525         nop
 526
 527 .Lhwfinish:
 528         std     %f0, [%o0 + 0x00]       ! store context
 529         std     %f2, [%o0 + 0x08]
 530         std     %f4, [%o0 + 0x10]
 531         std     %f6, [%o0 + 0x18]
 532         std     %f8, [%o0 + 0x20]
 533         std     %f10, [%o0 + 0x28]
 534         std     %f12, [%o0 + 0x30]
 535         retl
 536          std    %f14, [%o0 + 0x38]
 537
 538 .align  16
 539 .Lhwunaligned:
 540         alignaddr %o1, %g0, %o1
 541
 542         ldd     [%o1 + 0x00], %f18
 543 .Lhwunaligned_loop:
 544         ldd     [%o1 + 0x08], %f20
 545         ldd     [%o1 + 0x10], %f22
 546         ldd     [%o1 + 0x18], %f24
 547         ldd     [%o1 + 0x20], %f26
 548         ldd     [%o1 + 0x28], %f28
 549         ldd     [%o1 + 0x30], %f30
 550         ldd     [%o1 + 0x38], %f32
 551         ldd     [%o1 + 0x40], %f34
 552         ldd     [%o1 + 0x48], %f36
 553         ldd     [%o1 + 0x50], %f38
 554         ldd     [%o1 + 0x58], %f40
 555         ldd     [%o1 + 0x60], %f42
 556         ldd     [%o1 + 0x68], %f44
 557         ldd     [%o1 + 0x70], %f46
 558         ldd     [%o1 + 0x78], %f48
 559         subcc   %o2, 1, %o2             ! done yet?
 560         ldd     [%o1 + 0x80], %f50
 561         add     %o1, 0x80, %o1
 562         prefetch [%o1 + 63], 20
 563         prefetch [%o1 + 64+63], 20
 564
 565         faligndata %f18, %f20, %f16
 566         faligndata %f20, %f22, %f18
 567         faligndata %f22, %f24, %f20
 568         faligndata %f24, %f26, %f22
 569         faligndata %f26, %f28, %f24
 570         faligndata %f28, %f30, %f26
 571         faligndata %f30, %f32, %f28
 572         faligndata %f32, %f34, %f30
 573         faligndata %f34, %f36, %f32
 574         faligndata %f36, %f38, %f34
 575         faligndata %f38, %f40, %f36
 576         faligndata %f40, %f42, %f38
 577         faligndata %f42, %f44, %f40
 578         faligndata %f44, %f46, %f42
 579         faligndata %f46, %f48, %f44
 580         faligndata %f48, %f50, %f46
 581
 582         .word   0x81b02860              ! SHA512
 583
 584         bne,pt  SIZE_T_CC, .Lhwunaligned_loop
 585         for     %f50, %f50, %f18        ! %f18=%f50
 586
 587         ba      .Lhwfinish
 588         nop
 589 ___
 590 $code.=<<___ if ($SZ==4);               # SHA256 hardware path: state in %f0-%f7, block in %f8-%f22, opcode emitted as a raw .word; separate loops for 8-byte-aligned and unaligned input
 591         ld      [%o0 + 0x00], %f0
 592         ld      [%o0 + 0x04], %f1
 593         ld      [%o0 + 0x08], %f2
 594         ld      [%o0 + 0x0c], %f3
 595         ld      [%o0 + 0x10], %f4
 596         ld      [%o0 + 0x14], %f5
 597         andcc   %o1, 0x7, %g0
 598         ld      [%o0 + 0x18], %f6
 599         bne,pn  %icc, .Lhwunaligned
 600          ld     [%o0 + 0x1c], %f7
 601
 602 .Lhwloop:
 603         ldd     [%o1 + 0x00], %f8
 604         ldd     [%o1 + 0x08], %f10
 605         ldd     [%o1 + 0x10], %f12
 606         ldd     [%o1 + 0x18], %f14
 607         ldd     [%o1 + 0x20], %f16
 608         ldd     [%o1 + 0x28], %f18
 609         ldd     [%o1 + 0x30], %f20
 610         subcc   %o2, 1, %o2             ! done yet?
 611         ldd     [%o1 + 0x38], %f22
 612         add     %o1, 0x40, %o1
 613         prefetch [%o1 + 63], 20
 614
 615         .word   0x81b02840              ! SHA256
 616
 617         bne,pt  SIZE_T_CC, .Lhwloop
 618         nop
 619
 620 .Lhwfinish:
 621         st      %f0, [%o0 + 0x00]       ! store context
 622         st      %f1, [%o0 + 0x04]
 623         st      %f2, [%o0 + 0x08]
 624         st      %f3, [%o0 + 0x0c]
 625         st      %f4, [%o0 + 0x10]
 626         st      %f5, [%o0 + 0x14]
 627         st      %f6, [%o0 + 0x18]
 628         retl
 629          st     %f7, [%o0 + 0x1c]
 630
 631 .align  8
 632 .Lhwunaligned:
 633         alignaddr %o1, %g0, %o1
 634
 635         ldd     [%o1 + 0x00], %f10
 636 .Lhwunaligned_loop:
 637         ldd     [%o1 + 0x08], %f12
 638         ldd     [%o1 + 0x10], %f14
 639         ldd     [%o1 + 0x18], %f16
 640         ldd     [%o1 + 0x20], %f18
 641         ldd     [%o1 + 0x28], %f20
 642         ldd     [%o1 + 0x30], %f22
 643         ldd     [%o1 + 0x38], %f24
 644         subcc   %o2, 1, %o2             ! done yet?
 645         ldd     [%o1 + 0x40], %f26
 646         add     %o1, 0x40, %o1
 647         prefetch [%o1 + 63], 20
 648
 649         faligndata %f10, %f12, %f8
 650         faligndata %f12, %f14, %f10
 651         faligndata %f14, %f16, %f12
 652         faligndata %f16, %f18, %f14
 653         faligndata %f18, %f20, %f16
 654         faligndata %f20, %f22, %f18
 655         faligndata %f22, %f24, %f20
 656         faligndata %f24, %f26, %f22
 657
 658         .word   0x81b02840              ! SHA256
 659
 660         bne,pt  SIZE_T_CC, .Lhwunaligned_loop
 661         for     %f26, %f26, %f10        ! %f10=%f26
 662
 663         ba      .Lhwfinish
 664         nop
 665 ___
 666 $code.=<<___;           # software path: new frame with room for $locals, split $inp into aligned base + bit offset ($tmp31), turn $len into the end pointer
 667 .align  16
 668 .Lsoftware:
 669         save    %sp,-STACK_FRAME-$locals,%sp
 670         and     $inp,`$align-1`,$tmp31
 671         sllx    $len,`log(16*$SZ)/log(2)`,$len
 672         andn    $inp,`$align-1`,$inp
 673         sll     $tmp31,3,$tmp31
 674         add     $inp,$len,$len
 675 ___
 676 $code.=<<___ if ($SZ==8); # SHA512: $tmp32 = 32 - $tmp31
 677         mov     32,$tmp32
 678         sub     $tmp32,$tmp31,$tmp32
 679 ___
 680 $code.=<<___;           # compute the K table address relative to .Lpic and load the eight state words
 681 .Lpic:  call    .+8
 682         add     %o7,K${label}-.Lpic,$Ktbl
 683
 684         $LD     [$ctx+`0*$SZ`],$A
 685         $LD     [$ctx+`1*$SZ`],$B
 686         $LD     [$ctx+`2*$SZ`],$C
 687         $LD     [$ctx+`3*$SZ`],$D
 688         $LD     [$ctx+`4*$SZ`],$E
 689         $LD     [$ctx+`5*$SZ`],$F
 690         $LD     [$ctx+`6*$SZ`],$G
 691         $LD     [$ctx+`7*$SZ`],$H
 692
 693 .Lloop:
 694 ___
 695 for ($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }   # rounds 0..15; @V is rotated so each round sees the renamed a..h
 696 $code.=".L16_xx:\n";    # the 16 rounds below are re-executed at run time until the last K entry is seen
 697 for (;$i<32;$i++)       { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }  # 16 unrolled rounds with message schedule update
 698 $code.=<<___;           # $tmp2 still holds the K value of the round just executed; its low 12 bits are compared with $lastK
 699         and     $tmp2,0xfff,$tmp2
 700         cmp     $tmp2,$lastK
 701         bne     .L16_xx
 702         add     $Ktbl,`16*$SZ`,$Ktbl    ! Ktbl+=16
 703
 704 ___
 705 $code.=<<___ if ($SZ==4); # SHA256: reload the saved state into @X, add the working variables to it and store the sums back
 706         $LD     [$ctx+`0*$SZ`],@X[0]
 707         $LD     [$ctx+`1*$SZ`],@X[1]
 708         $LD     [$ctx+`2*$SZ`],@X[2]
 709         $LD     [$ctx+`3*$SZ`],@X[3]
 710         $LD     [$ctx+`4*$SZ`],@X[4]
 711         $LD     [$ctx+`5*$SZ`],@X[5]
 712         $LD     [$ctx+`6*$SZ`],@X[6]
 713         $LD     [$ctx+`7*$SZ`],@X[7]
 714
 715         add     $A,@X[0],$A
 716         $ST     $A,[$ctx+`0*$SZ`]
 717         add     $B,@X[1],$B
 718         $ST     $B,[$ctx+`1*$SZ`]
 719         add     $C,@X[2],$C
 720         $ST     $C,[$ctx+`2*$SZ`]
 721         add     $D,@X[3],$D
 722         $ST     $D,[$ctx+`3*$SZ`]
 723         add     $E,@X[4],$E
 724         $ST     $E,[$ctx+`4*$SZ`]
 725         add     $F,@X[5],$F
 726         $ST     $F,[$ctx+`5*$SZ`]
 727         add     $G,@X[6],$G
 728         $ST     $G,[$ctx+`6*$SZ`]
 729         add     $H,@X[7],$H
 730         $ST     $H,[$ctx+`7*$SZ`]
 731 ___
 732 $code.=<<___ if ($SZ==8); # SHA512: reload the saved state as 32-bit halves into %l0-%l7 (four words at a time), join them, add the working variables and store the sums back
 733         ld      [$ctx+`0*$SZ+0`],%l0
 734         ld      [$ctx+`0*$SZ+4`],%l1
 735         ld      [$ctx+`1*$SZ+0`],%l2
 736         ld      [$ctx+`1*$SZ+4`],%l3
 737         ld      [$ctx+`2*$SZ+0`],%l4
 738         ld      [$ctx+`2*$SZ+4`],%l5
 739         ld      [$ctx+`3*$SZ+0`],%l6
 740
 741         sllx    %l0,32,$tmp0
 742         ld      [$ctx+`3*$SZ+4`],%l7
 743         sllx    %l2,32,$tmp1
 744         or      %l1,$tmp0,$tmp0
 745         or      %l3,$tmp1,$tmp1
 746         add     $tmp0,$A,$A
 747         add     $tmp1,$B,$B
 748         $ST     $A,[$ctx+`0*$SZ`]
 749         sllx    %l4,32,$tmp2
 750         $ST     $B,[$ctx+`1*$SZ`]
 751         sllx    %l6,32,$T1
 752         or      %l5,$tmp2,$tmp2
 753         or      %l7,$T1,$T1
 754         add     $tmp2,$C,$C
 755         $ST     $C,[$ctx+`2*$SZ`]
 756         add     $T1,$D,$D
 757         $ST     $D,[$ctx+`3*$SZ`]
 758
 759         ld      [$ctx+`4*$SZ+0`],%l0
 760         ld      [$ctx+`4*$SZ+4`],%l1
 761         ld      [$ctx+`5*$SZ+0`],%l2
 762         ld      [$ctx+`5*$SZ+4`],%l3
 763         ld      [$ctx+`6*$SZ+0`],%l4
 764         ld      [$ctx+`6*$SZ+4`],%l5
 765         ld      [$ctx+`7*$SZ+0`],%l6
 766
 767         sllx    %l0,32,$tmp0
 768         ld      [$ctx+`7*$SZ+4`],%l7
 769         sllx    %l2,32,$tmp1
 770         or      %l1,$tmp0,$tmp0
 771         or      %l3,$tmp1,$tmp1
 772         add     $tmp0,$E,$E
 773         add     $tmp1,$F,$F
 774         $ST     $E,[$ctx+`4*$SZ`]
 775         sllx    %l4,32,$tmp2
 776         $ST     $F,[$ctx+`5*$SZ`]
 777         sllx    %l6,32,$T1
 778         or      %l5,$tmp2,$tmp2
 779         or      %l7,$T1,$T1
 780         add     $tmp2,$G,$G
 781         $ST     $G,[$ctx+`6*$SZ`]
 782         add     $T1,$H,$H
 783         $ST     $H,[$ctx+`7*$SZ`]
 784 ___
 785 $code.=<<___;           # step to the next block, loop until $inp reaches the end pointer (rewinding $Ktbl in the delay slot), then return
 786         add     $inp,`16*$SZ`,$inp              ! advance inp
 787         cmp     $inp,$len
 788         bne     SIZE_T_CC,.Lloop
 789         sub     $Ktbl,`($rounds-16)*$SZ`,$Ktbl  ! rewind Ktbl
 790
 791         ret
 792         restore
 793 .type   sha${label}_block_data_order,#function
 794 .size   sha${label}_block_data_order,(.-sha${label}_block_data_order)
 795 .asciz  "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 796 .align  4
 797 ___
 798
 799 # Purpose of these subroutines is to explicitly encode VIS instructions,
 800 # so that one can compile the module without having to specify VIS
 801 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 802 # Idea is to reserve for option to produce "universal" binary and let
 803 # programmer detect if current CPU is VIS capable at run-time.
     #
     # unvis($mnemonic,$rs1,$rs2,$rd): encode a three-operand VIS
     # instruction on %f registers as a raw ".word".
     #   $mnemonic      - instruction name; only "faligndata" and "for" are
     #                    known to the table below
     #   $rs1,$rs2,$rd  - operands written as "%fN"
     # Returns ".word\t0x<encoding> !<original text>", or the original
     # instruction text unchanged when the mnemonic is unknown, an operand
     # is not an %f register, or an operand is an odd-numbered register
     # >= 32 (which cannot be encoded).
 804 sub unvis {
 805 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 806 my ($ref,$opf);         # parenthesised so that $opf is lexical as well, not a package global
 807 my %visopf = (  "faligndata"    => 0x048,
 808                 "for"           => 0x07c        );
 809
 810     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 811
 812     if ($opf=$visopf{$mnemonic}) {
 813         foreach ($rs1,$rs2,$rd) {       # $_ aliases the operand, so each is replaced by its register number
 814             return $ref if (!/%f([0-9]{1,2})/);
 815             $_=$1;
 816             if ($1>=32) {
 817                 return $ref if ($1&1);
 818                 # re-encode for upper double register addressing
 819                 $_=($1|$1>>5)&31;
 820             }
 821         }
 822
 823         return  sprintf ".word\t0x%08x !%s",
 824                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 825                         $ref;
 826     } else {
 827         return $ref;
 828     }
 829 }
 830 sub unalignaddr {
     # unalignaddr($mnemonic,$rs1,$rs2,$rd): encode a three-operand
     # instruction on integer registers (called for "alignaddr") as a raw
     # ".word".  Each operand must look like %gN, %oN, %lN or %iN with N in
     # 0..7; it is converted to a register number by adding N to the
     # bank's entry in %bias.  Returns ".word\t0x<encoding> !<original
     # text>", or the original text unchanged if any operand does not
     # match.
 831 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 832 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 833 my $ref="$mnemonic\t$rs1,$rs2,$rd";
 834
 835     foreach ($rs1,$rs2,$rd) {   # $_ aliases the operand, so each is replaced by its register number
 836         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
 837         else                    { return $ref; }
 838     }
 839     return  sprintf ".word\t0x%08x !%s",
 840                     0x81b00300|$rd<<25|$rs1<<14|$rs2,
 841                     $ref;
 842 }
 843
 844 foreach (split("\n",$code)) {   # post-process the generated text line by line and print it
 845         s/\`([^\`]*)\`/eval $1/ge;      # evaluate the `...` expressions embedded in the templates
 846
             # replace f* instructions on three %f registers by raw .word encodings
 847         s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
 848                 &unvis($1,$2,$3,$4)
 849          /ge;
             # likewise for alignaddr on three integer registers
 850         s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 851                 &unalignaddr($1,$2,$3,$4)
 852          /ge;
 853
 854         print $_,"\n";
 855 }
 856
 857 close STDOUT or die "error closing STDOUT: $!";        # buffered write errors (e.g. disk full) only surface at close