2 # Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Hardware SPARC T4 support by David S. Miller
17 # ====================================================================
19 # SHA256 performance improvement over compiler generated code varies
20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
21 # build]. Just like in SHA1 module I aim to ensure scalability on
22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
24 # SHA512 on pre-T1 UltraSPARC.
26 # Performance is >75% better than 64-bit code generated by Sun C and
27 # over 2x better than 32-bit code. X[16] resides on stack, but access to it
28 # is scheduled for L2 latency and staged through 32 least significant
29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
30 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
31 # good [optimal coefficient is 50%].
33 # SHA512 on UltraSPARC T1.
35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
36 # because 64-bit code generator has the advantage of using 64-bit
37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
39 # code by 60%, not to mention that it doesn't suffer from severe decay
40 # when running 4x as many threads as physical cores, and that it leaves gcc
41 # [3.4] behind by a factor of over 4! Compared to SHA256, single-thread
42 # performance is only 10% better, but overall throughput at the maximum
43 # number of threads for a given CPU exceeds that of SHA256
44 # by 30% [again, optimal coefficient is 50%].
46 # (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
47 # in-order, i.e. a load instruction has to complete before the next
48 # instruction in the given thread is executed, even if the latter
49 # does not depend on the load result! This means that on T1 two 32-bit
50 # loads are always slower than one 64-bit load. Once again this
51 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
52 # 2x32-bit loads can be as fast as 1x64-bit ones.
54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
56 # saturates at 11.5x single-process result on 8-core processor, or
57 # ~11/16GBps per 2.85GHz socket.
# Redirect everything subsequently printed to STDOUT into the file named
# by $output.
# NOTE(review): two-argument open whose result is not checked -- a failed
# open is not reported here.
60 open STDOUT,">$output";
# An output file name containing "512" selects the first parameter set
# below: 64-bit load/store/shift mnemonics, and a 16*$SZ-byte area for
# X[16] ($locals is later subtracted from %sp when the frame is created).
62 if ($output =~ /512/) {
65 $LD="ldx"; # load from memory
66 $ST="stx"; # store to memory
67 $SLL="sllx"; # shift left logical
68 $SRL="srlx"; # shift right logical
# Shift/rotate amounts consumed by the message-schedule update code.
71 @sigma0=( 7, 1, 8); # right shift first
72 @sigma1=( 6,19,61); # right shift first
77 $locals=16*$SZ; # X[16]
# Working variables in their initial order; the round loops rotate this list.
87 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# Second parameter set: 32-bit mnemonics, no extra stack space ($locals=0)
# and X[16] held in the eight registers listed in @X.
# NOTE(review): the line separating the two sets is not part of this
# excerpt; presumably this is the non-512 (SHA-256) alternative -- confirm.
91 $LD="ld"; # load from memory
92 $ST="st"; # store to memory
93 $SLL="sll"; # shift left logical
94 $SRL="srl"; # shift right logical
97 @sigma0=( 3, 7,18); # right shift first
98 @sigma1=(10,17,19); # right shift first
103 $locals=0; # X[16] is register resident
104 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
114 @V=($A,$B,$C,$D,$E,$F,$G,$H);
130 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
140 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
145 sllx @X[0],$tmp31,@X[0]
150 srlx @X[$j+1],$tmp32,$tmp1
151 sllx @X[$j+1],$tmp31,@X[$j+1]
152 or $tmp1,@X[$j],@X[$j]
163 $code.="\tadd @X[$i/2],$h,$T1\n";
165 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
171 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
172 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
174 $code.=<<___ if ($i==0);
185 $code.=<<___ if ($i<15);
186 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
188 sllx @pair[0],$tmp0,$tmp1
189 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
190 srlx @pair[2],$tmp32,@pair[1]
192 or @pair[1],$tmp2,$tmp2
193 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
195 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
197 $code.=<<___ if ($i==12);
201 $code.=<<___ if ($i==15);
202 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
203 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
205 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
206 sllx @pair[0],$tmp0,$tmp1
207 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
208 srlx @pair[2],$tmp32,@pair[1]
210 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
211 or @pair[1],$tmp2,$tmp2
212 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
214 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
215 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
216 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
217 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
223 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
228 $code.="\tadd $h,$T1,$T1\n";
232 $SRL $e,@Sigma1[0],$h !! $i
234 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
236 $SRL $e,@Sigma1[1],$tmp0
238 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
240 $SRL $e,@Sigma1[2],$tmp0
242 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
244 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
245 xor $tmp1,$h,$tmp0 ! Sigma1(e)
247 $SRL $a,@Sigma0[0],$h
249 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
250 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
252 $SRL $a,@Sigma0[1],$tmp0
254 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
256 $SRL $a,@Sigma0[2],$tmp0
258 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
260 xor $tmp1,$h,$h ! Sigma0(a)
265 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
266 add $tmp2,$T1,$T1 ! +=K[$i]
281 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
283 $xi=@X[(($i+1)/2)%8];
286 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
287 sll $xi,`32-@sigma0[2]`,$tmp1
288 srl $xi,@sigma0[1],$tmp0
290 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
292 srl $xi,@sigma0[2],$tmp0
296 $xi=@X[(($i+14)/2)%8];
299 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
302 srl $xi,@sigma1[0],$tmp2
303 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
304 sll $xi,`32-@sigma1[2]`,$tmp1
305 srl $xi,@sigma1[1],$tmp0
306 xor $tmp1,$tmp2,$tmp2
307 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
308 xor $tmp0,$tmp2,$tmp2
309 srl $xi,@sigma1[2],$tmp0
310 xor $tmp1,$tmp2,$tmp2
315 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
316 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
317 srl @X[($i/2)%8],0,$tmp0
318 add $tmp2,$tmp1,$tmp1
319 add $xi,$T1,$T1 ! +=X[i]
320 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
324 or $T1,@X[($i/2)%8],@X[($i/2)%8]
327 $xi=@X[(($i+9)/2)%8];
329 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
330 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
331 add $xi,$T1,$T1 ! +=X[i+9]
332 add $tmp2,$tmp1,$tmp1
333 srl @X[($i/2)%8],0,@X[($i/2)%8]
337 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
346 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
349 sllx %l2,32,$tmp0 !! Xupdate($i)
352 srlx $tmp0,@sigma0[0],$T1
353 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
354 sllx $tmp0,`64-@sigma0[2]`,$tmp1
355 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
356 srlx $tmp0,@sigma0[1],$tmp0
358 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
360 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
363 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
366 srlx $tmp2,@sigma1[0],$tmp1
367 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
368 sllx $tmp2,`64-@sigma1[2]`,$tmp0
369 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
370 srlx $tmp2,@sigma1[1],$tmp2
371 xor $tmp0,$tmp1,$tmp1
372 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
373 xor $tmp2,$tmp1,$tmp1
374 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
375 xor $tmp0,$tmp1,$tmp1
377 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
378 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
380 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
384 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
386 add $tmp0,$T1,$T1 ! +=X[$i+9]
387 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
388 add $tmp2,$T1,$T1 ! +=X[$i]
389 $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
395 #include "sparc_arch.h"
398 .register %g2,#scratch
399 .register %g3,#scratch
402 .section ".text",#alloc,#execinstr
406 .type K${label},#object
410 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
411 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
412 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
413 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
414 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
415 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
416 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
417 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
418 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
419 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
420 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
421 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
422 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
423 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
424 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
425 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
429 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
430 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
431 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
432 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
433 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
434 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
435 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
436 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
437 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
438 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
439 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
440 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
441 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
442 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
443 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
444 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
445 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
446 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
447 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
448 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
449 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
450 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
451 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
452 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
453 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
454 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
455 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
456 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
457 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
458 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
459 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
460 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
461 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
462 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
463 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
464 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
465 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
466 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
467 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
468 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
472 .size K${label},.-K${label}
478 .globl sha${label}_block_data_order
480 sha${label}_block_data_order:
481 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
482 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
484 andcc %g1, CFR_SHA${label}, %g0
488 $code.=<<___ if ($SZ==8); # SHA512
489 ldd [%o0 + 0x00], %f0 ! load context
490 ldd [%o0 + 0x08], %f2
491 ldd [%o0 + 0x10], %f4
492 ldd [%o0 + 0x18], %f6
493 ldd [%o0 + 0x20], %f8
494 ldd [%o0 + 0x28], %f10
496 ldd [%o0 + 0x30], %f12
497 bne,pn %icc, .Lhwunaligned
498 ldd [%o0 + 0x38], %f14
501 ldd [%o1 + 0x00], %f16
502 ldd [%o1 + 0x08], %f18
503 ldd [%o1 + 0x10], %f20
504 ldd [%o1 + 0x18], %f22
505 ldd [%o1 + 0x20], %f24
506 ldd [%o1 + 0x28], %f26
507 ldd [%o1 + 0x30], %f28
508 ldd [%o1 + 0x38], %f30
509 ldd [%o1 + 0x40], %f32
510 ldd [%o1 + 0x48], %f34
511 ldd [%o1 + 0x50], %f36
512 ldd [%o1 + 0x58], %f38
513 ldd [%o1 + 0x60], %f40
514 ldd [%o1 + 0x68], %f42
515 ldd [%o1 + 0x70], %f44
516 subcc %o2, 1, %o2 ! done yet?
517 ldd [%o1 + 0x78], %f46
519 prefetch [%o1 + 63], 20
520 prefetch [%o1 + 64+63], 20
522 .word 0x81b02860 ! SHA512
524 bne,pt SIZE_T_CC, .Lhwaligned_loop
528 std %f0, [%o0 + 0x00] ! store context
529 std %f2, [%o0 + 0x08]
530 std %f4, [%o0 + 0x10]
531 std %f6, [%o0 + 0x18]
532 std %f8, [%o0 + 0x20]
533 std %f10, [%o0 + 0x28]
534 std %f12, [%o0 + 0x30]
536 std %f14, [%o0 + 0x38]
540 alignaddr %o1, %g0, %o1
542 ldd [%o1 + 0x00], %f18
544 ldd [%o1 + 0x08], %f20
545 ldd [%o1 + 0x10], %f22
546 ldd [%o1 + 0x18], %f24
547 ldd [%o1 + 0x20], %f26
548 ldd [%o1 + 0x28], %f28
549 ldd [%o1 + 0x30], %f30
550 ldd [%o1 + 0x38], %f32
551 ldd [%o1 + 0x40], %f34
552 ldd [%o1 + 0x48], %f36
553 ldd [%o1 + 0x50], %f38
554 ldd [%o1 + 0x58], %f40
555 ldd [%o1 + 0x60], %f42
556 ldd [%o1 + 0x68], %f44
557 ldd [%o1 + 0x70], %f46
558 ldd [%o1 + 0x78], %f48
559 subcc %o2, 1, %o2 ! done yet?
560 ldd [%o1 + 0x80], %f50
562 prefetch [%o1 + 63], 20
563 prefetch [%o1 + 64+63], 20
565 faligndata %f18, %f20, %f16
566 faligndata %f20, %f22, %f18
567 faligndata %f22, %f24, %f20
568 faligndata %f24, %f26, %f22
569 faligndata %f26, %f28, %f24
570 faligndata %f28, %f30, %f26
571 faligndata %f30, %f32, %f28
572 faligndata %f32, %f34, %f30
573 faligndata %f34, %f36, %f32
574 faligndata %f36, %f38, %f34
575 faligndata %f38, %f40, %f36
576 faligndata %f40, %f42, %f38
577 faligndata %f42, %f44, %f40
578 faligndata %f44, %f46, %f42
579 faligndata %f46, %f48, %f44
580 faligndata %f48, %f50, %f46
582 .word 0x81b02860 ! SHA512
584 bne,pt SIZE_T_CC, .Lhwunaligned_loop
585 for %f50, %f50, %f18 ! %f18=%f50
590 $code.=<<___ if ($SZ==4); # SHA256
599 bne,pn %icc, .Lhwunaligned
603 ldd [%o1 + 0x00], %f8
604 ldd [%o1 + 0x08], %f10
605 ldd [%o1 + 0x10], %f12
606 ldd [%o1 + 0x18], %f14
607 ldd [%o1 + 0x20], %f16
608 ldd [%o1 + 0x28], %f18
609 ldd [%o1 + 0x30], %f20
610 subcc %o2, 1, %o2 ! done yet?
611 ldd [%o1 + 0x38], %f22
613 prefetch [%o1 + 63], 20
615 .word 0x81b02840 ! SHA256
617 bne,pt SIZE_T_CC, .Lhwloop
621 st %f0, [%o0 + 0x00] ! store context
633 alignaddr %o1, %g0, %o1
635 ldd [%o1 + 0x00], %f10
637 ldd [%o1 + 0x08], %f12
638 ldd [%o1 + 0x10], %f14
639 ldd [%o1 + 0x18], %f16
640 ldd [%o1 + 0x20], %f18
641 ldd [%o1 + 0x28], %f20
642 ldd [%o1 + 0x30], %f22
643 ldd [%o1 + 0x38], %f24
644 subcc %o2, 1, %o2 ! done yet?
645 ldd [%o1 + 0x40], %f26
647 prefetch [%o1 + 63], 20
649 faligndata %f10, %f12, %f8
650 faligndata %f12, %f14, %f10
651 faligndata %f14, %f16, %f12
652 faligndata %f16, %f18, %f14
653 faligndata %f18, %f20, %f16
654 faligndata %f20, %f22, %f18
655 faligndata %f22, %f24, %f20
656 faligndata %f24, %f26, %f22
658 .word 0x81b02840 ! SHA256
660 bne,pt SIZE_T_CC, .Lhwunaligned_loop
661 for %f26, %f26, %f10 ! %f10=%f26
669 save %sp,-STACK_FRAME-$locals,%sp
670 and $inp,`$align-1`,$tmp31
671 sllx $len,`log(16*$SZ)/log(2)`,$len
672 andn $inp,`$align-1`,$inp
676 $code.=<<___ if ($SZ==8); # SHA512
678 sub $tmp32,$tmp31,$tmp32
682 add %o7,K${label}-.Lpic,$Ktbl
684 $LD [$ctx+`0*$SZ`],$A
685 $LD [$ctx+`1*$SZ`],$B
686 $LD [$ctx+`2*$SZ`],$C
687 $LD [$ctx+`3*$SZ`],$D
688 $LD [$ctx+`4*$SZ`],$E
689 $LD [$ctx+`5*$SZ`],$F
690 $LD [$ctx+`6*$SZ`],$G
691 $LD [$ctx+`7*$SZ`],$H
695 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
697 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
699 and $tmp2,0xfff,$tmp2
702 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
705 $code.=<<___ if ($SZ==4); # SHA256
706 $LD [$ctx+`0*$SZ`],@X[0]
707 $LD [$ctx+`1*$SZ`],@X[1]
708 $LD [$ctx+`2*$SZ`],@X[2]
709 $LD [$ctx+`3*$SZ`],@X[3]
710 $LD [$ctx+`4*$SZ`],@X[4]
711 $LD [$ctx+`5*$SZ`],@X[5]
712 $LD [$ctx+`6*$SZ`],@X[6]
713 $LD [$ctx+`7*$SZ`],@X[7]
716 $ST $A,[$ctx+`0*$SZ`]
718 $ST $B,[$ctx+`1*$SZ`]
720 $ST $C,[$ctx+`2*$SZ`]
722 $ST $D,[$ctx+`3*$SZ`]
724 $ST $E,[$ctx+`4*$SZ`]
726 $ST $F,[$ctx+`5*$SZ`]
728 $ST $G,[$ctx+`6*$SZ`]
730 $ST $H,[$ctx+`7*$SZ`]
732 $code.=<<___ if ($SZ==8); # SHA512
733 ld [$ctx+`0*$SZ+0`],%l0
734 ld [$ctx+`0*$SZ+4`],%l1
735 ld [$ctx+`1*$SZ+0`],%l2
736 ld [$ctx+`1*$SZ+4`],%l3
737 ld [$ctx+`2*$SZ+0`],%l4
738 ld [$ctx+`2*$SZ+4`],%l5
739 ld [$ctx+`3*$SZ+0`],%l6
742 ld [$ctx+`3*$SZ+4`],%l7
748 $ST $A,[$ctx+`0*$SZ`]
750 $ST $B,[$ctx+`1*$SZ`]
755 $ST $C,[$ctx+`2*$SZ`]
757 $ST $D,[$ctx+`3*$SZ`]
759 ld [$ctx+`4*$SZ+0`],%l0
760 ld [$ctx+`4*$SZ+4`],%l1
761 ld [$ctx+`5*$SZ+0`],%l2
762 ld [$ctx+`5*$SZ+4`],%l3
763 ld [$ctx+`6*$SZ+0`],%l4
764 ld [$ctx+`6*$SZ+4`],%l5
765 ld [$ctx+`7*$SZ+0`],%l6
768 ld [$ctx+`7*$SZ+4`],%l7
774 $ST $E,[$ctx+`4*$SZ`]
776 $ST $F,[$ctx+`5*$SZ`]
781 $ST $G,[$ctx+`6*$SZ`]
783 $ST $H,[$ctx+`7*$SZ`]
786 add $inp,`16*$SZ`,$inp ! advance inp
789 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
793 .type sha${label}_block_data_order,#function
794 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
795 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
799 # The purpose of these subroutines is to explicitly encode VIS instructions,
800 # so that one can compile the module without having to specify VIS
801 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
802 # The idea is to keep open the option of producing a "universal" binary and
803 # let the programmer detect at run-time whether the current CPU is VIS capable.
# Arguments: the instruction mnemonic followed by its three operand
# strings (two sources and a destination).
805 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Opcode-field (opf) values for the mnemonics that get hand-encoded.
807 my %visopf = ( "faligndata" => 0x048,
# Plain textual form of the instruction; this is what the early returns
# below hand back when the operands cannot be encoded.
810 $ref = "$mnemonic\t$rs1,$rs2,$rd";
# Encoding is attempted only for mnemonics that have an entry in %visopf.
812 if ($opf=$visopf{$mnemonic}) {
813 foreach ($rs1,$rs2,$rd) {
# Every operand has to look like %fN, otherwise keep the textual form.
814 return $ref if (!/%f([0-9]{1,2})/);
# An odd captured register number also falls back to the textual form.
# NOTE(review): the condition enclosing this check is not visible in this
# excerpt.
817 return $ref if ($1&1);
818 # re-encode for upper double register addressing
# Emit the instruction as a raw .word: base 0x81b00000 with rd shifted to
# bit 25, rs1 to bit 14, opf to bit 5 and rs2 in the low bits.
823 return sprintf ".word\t0x%08x !%s",
824 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Arguments: the instruction mnemonic followed by its three operand
# strings (two sources and a destination).
831 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Base register number for each integer register bank letter.
832 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# Plain textual form, returned when an operand cannot be encoded.
833 my $ref="$mnemonic\t$rs1,$rs2,$rd";
# $_ aliases each operand in turn, so the assignment below rewrites
# $rs1/$rs2/$rd in place to numeric register indices (bank bias + 0..7).
835 foreach ($rs1,$rs2,$rd) {
836 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
# Anything that is not a %g/%o/%l/%i register keeps the textual form.
837 else { return $ref; }
# Emit the instruction as a raw .word: base 0x81b00300 with rd shifted to
# bit 25, rs1 to bit 14 and rs2 in the low bits.
839 return sprintf ".word\t0x%08x !%s",
840 0x81b00300|$rd<<25|$rs1<<14|$rs2,
844 foreach (split("\n",$code)) {
845 s/\`([^\`]*)\`/eval $1/ge;
847 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
850 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
851 &unalignaddr($1,$2,$3,$4)