3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
10 # ====================================================================
12 # SHA256 performance improvement over compiler generated code varies
13 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
14 # build]. Just like in SHA1 module I aim to ensure scalability on
15 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
17 # SHA512 on pre-T1 UltraSPARC.
19 # Performance is >75% better than 64-bit code generated by Sun C and
20 # over 2x than 32-bit code. X[16] resides on stack, but access to it
21 # is scheduled for L2 latency and staged through 32 least significant
22 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
# duality. Nevertheless it's ~40% faster than SHA256, which is pretty
24 # good [optimal coefficient is 50%].
26 # SHA512 on UltraSPARC T1.
28 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
29 # because 64-bit code generator has the advantage of using 64-bit
30 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
31 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
# code by 60%, not to mention that it doesn't suffer from severe decay
# when running 4 times as many threads as physical cores, and it leaves gcc
34 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
35 # performance is only 10% better, but overall throughput for maximum
36 # amount of threads for given CPU exceeds corresponding one of SHA256
37 # by 30% [again, optimal coefficient is 50%].
39 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
# in-order, i.e. a load instruction has to complete before the next
# instruction in the given thread is executed, even if the latter is
42 # not dependent on load result! This means that on T1 two 32-bit
43 # loads are always slower than one 64-bit load. Once again this
44 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
45 # 2x32-bit loads can be as fast as 1x64-bit ones.
47 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
48 # which is 9.3x/11.1x faster than software. Multi-process benchmark
49 # saturates at 11.5x single-process result on 8-core processor, or
50 # ~11/16GBps per 2.85GHz socket.
# Redirect this script's STDOUT to the requested output file so all
# generated assembly printed below lands there. Use 3-arg open and check
# for failure instead of the unchecked 2-arg form (which is also open to
# mode injection via the filename).
open STDOUT, '>', $output or die "can't open $output: $!";
# Parameterize the code generator from the output filename:
# a name containing "512" selects SHA-512 (64-bit words), otherwise
# SHA-256 (32-bit words) parameters are used.
# --- SHA-512 branch: 64-bit load/store and shift mnemonics ---
if ($output =~ /512/) {
$LD="ldx"; # load from memory
$ST="stx"; # store to memory
$SLL="sllx"; # shift left logical
$SRL="srlx"; # shift right logical
# sigma0/sigma1 shift/rotate amounts for the SHA-512 message schedule
@sigma0=( 7, 1, 8); # right shift first
@sigma1=( 6,19,61); # right shift first
$locals=16*$SZ; # X[16]
@V=($A,$B,$C,$D,$E,$F,$G,$H);
# --- SHA-256 branch (else-arm; the brace is outside this view):
# 32-bit mnemonics, and X[16] lives entirely in registers ---
$LD="ld"; # load from memory
$ST="st"; # store to memory
$SLL="sll"; # shift left logical
$SRL="srl"; # shift right logical
# sigma0/sigma1 shift/rotate amounts for the SHA-256 message schedule
@sigma0=( 3, 7,18); # right shift first
@sigma1=(10,17,19); # right shift first
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
@V=($A,$B,$C,$D,$E,$F,$G,$H);
123 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
133 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
138 sllx @X[0],$tmp31,@X[0]
143 srlx @X[$j+1],$tmp32,$tmp1
144 sllx @X[$j+1],$tmp31,@X[$j+1]
145 or $tmp1,@X[$j],@X[$j]
156 $code.="\tadd @X[$i/2],$h,$T1\n";
158 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
164 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
165 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
167 $code.=<<___ if ($i==0);
178 $code.=<<___ if ($i<15);
179 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
181 sllx @pair[0],$tmp0,$tmp1
182 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
183 srlx @pair[2],$tmp32,@pair[1]
185 or @pair[1],$tmp2,$tmp2
186 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
188 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
190 $code.=<<___ if ($i==12);
194 $code.=<<___ if ($i==15);
195 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
196 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
198 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
199 sllx @pair[0],$tmp0,$tmp1
200 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
201 srlx @pair[2],$tmp32,@pair[1]
203 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
204 or @pair[1],$tmp2,$tmp2
205 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
207 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
208 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
209 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
210 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
216 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
221 $code.="\tadd $h,$T1,$T1\n";
225 $SRL $e,@Sigma1[0],$h !! $i
227 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
229 $SRL $e,@Sigma1[1],$tmp0
231 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
233 $SRL $e,@Sigma1[2],$tmp0
235 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
237 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
238 xor $tmp1,$h,$tmp0 ! Sigma1(e)
240 $SRL $a,@Sigma0[0],$h
242 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
243 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
245 $SRL $a,@Sigma0[1],$tmp0
247 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
249 $SRL $a,@Sigma0[2],$tmp0
251 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
253 xor $tmp1,$h,$h ! Sigma0(a)
258 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
259 add $tmp2,$T1,$T1 ! +=K[$i]
274 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
276 $xi=@X[(($i+1)/2)%8];
279 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
280 sll $xi,`32-@sigma0[2]`,$tmp1
281 srl $xi,@sigma0[1],$tmp0
283 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
285 srl $xi,@sigma0[2],$tmp0
289 $xi=@X[(($i+14)/2)%8];
292 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
295 srl $xi,@sigma1[0],$tmp2
296 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
297 sll $xi,`32-@sigma1[2]`,$tmp1
298 srl $xi,@sigma1[1],$tmp0
299 xor $tmp1,$tmp2,$tmp2
300 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
301 xor $tmp0,$tmp2,$tmp2
302 srl $xi,@sigma1[2],$tmp0
303 xor $tmp1,$tmp2,$tmp2
308 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
309 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
310 srl @X[($i/2)%8],0,$tmp0
311 add $tmp2,$tmp1,$tmp1
312 add $xi,$T1,$T1 ! +=X[i]
313 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
317 or $T1,@X[($i/2)%8],@X[($i/2)%8]
320 $xi=@X[(($i+9)/2)%8];
322 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
323 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
324 add $xi,$T1,$T1 ! +=X[i+9]
325 add $tmp2,$tmp1,$tmp1
326 srl @X[($i/2)%8],0,@X[($i/2)%8]
330 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
339 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
342 sllx %l2,32,$tmp0 !! Xupdate($i)
345 srlx $tmp0,@sigma0[0],$T1
346 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
347 sllx $tmp0,`64-@sigma0[2]`,$tmp1
348 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
349 srlx $tmp0,@sigma0[1],$tmp0
351 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
353 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
356 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
359 srlx $tmp2,@sigma1[0],$tmp1
360 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
361 sllx $tmp2,`64-@sigma1[2]`,$tmp0
362 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
363 srlx $tmp2,@sigma1[1],$tmp2
364 xor $tmp0,$tmp1,$tmp1
365 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
366 xor $tmp2,$tmp1,$tmp1
367 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
368 xor $tmp0,$tmp1,$tmp1
370 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
371 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
373 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
377 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
379 add $tmp0,$T1,$T1 ! +=X[$i+9]
380 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
381 add $tmp2,$T1,$T1 ! +=X[$i]
382 $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
388 #include "sparc_arch.h"
391 .register %g2,#scratch
392 .register %g3,#scratch
395 .section ".text",#alloc,#execinstr
399 .type K${label},#object
403 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
404 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
405 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
406 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
407 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
408 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
409 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
410 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
411 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
412 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
413 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
414 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
415 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
416 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
417 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
418 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
422 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
423 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
424 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
425 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
426 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
427 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
428 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
429 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
430 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
431 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
432 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
433 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
434 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
435 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
436 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
437 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
438 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
439 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
440 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
441 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
442 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
443 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
444 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
445 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
446 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
447 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
448 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
449 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
450 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
451 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
452 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
453 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
454 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
455 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
456 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
457 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
458 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
459 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
460 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
461 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
465 .size K${label},.-K${label}
471 .globl sha${label}_block_data_order
473 sha${label}_block_data_order:
474 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
475 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
477 andcc %g1, CFR_SHA${label}, %g0
481 $code.=<<___ if ($SZ==8); # SHA512
482 ldd [%o0 + 0x00], %f0 ! load context
483 ldd [%o0 + 0x08], %f2
484 ldd [%o0 + 0x10], %f4
485 ldd [%o0 + 0x18], %f6
486 ldd [%o0 + 0x20], %f8
487 ldd [%o0 + 0x28], %f10
489 ldd [%o0 + 0x30], %f12
490 bne,pn %icc, .Lhwunaligned
491 ldd [%o0 + 0x38], %f14
494 ldd [%o1 + 0x00], %f16
495 ldd [%o1 + 0x08], %f18
496 ldd [%o1 + 0x10], %f20
497 ldd [%o1 + 0x18], %f22
498 ldd [%o1 + 0x20], %f24
499 ldd [%o1 + 0x28], %f26
500 ldd [%o1 + 0x30], %f28
501 ldd [%o1 + 0x38], %f30
502 ldd [%o1 + 0x40], %f32
503 ldd [%o1 + 0x48], %f34
504 ldd [%o1 + 0x50], %f36
505 ldd [%o1 + 0x58], %f38
506 ldd [%o1 + 0x60], %f40
507 ldd [%o1 + 0x68], %f42
508 ldd [%o1 + 0x70], %f44
509 subcc %o2, 1, %o2 ! done yet?
510 ldd [%o1 + 0x78], %f46
512 prefetch [%o1 + 63], 20
513 prefetch [%o1 + 64+63], 20
515 .word 0x81b02860 ! SHA512
517 bne,pt SIZE_T_CC, .Lhwaligned_loop
521 std %f0, [%o0 + 0x00] ! store context
522 std %f2, [%o0 + 0x08]
523 std %f4, [%o0 + 0x10]
524 std %f6, [%o0 + 0x18]
525 std %f8, [%o0 + 0x20]
526 std %f10, [%o0 + 0x28]
527 std %f12, [%o0 + 0x30]
529 std %f14, [%o0 + 0x38]
533 alignaddr %o1, %g0, %o1
535 ldd [%o1 + 0x00], %f18
537 ldd [%o1 + 0x08], %f20
538 ldd [%o1 + 0x10], %f22
539 ldd [%o1 + 0x18], %f24
540 ldd [%o1 + 0x20], %f26
541 ldd [%o1 + 0x28], %f28
542 ldd [%o1 + 0x30], %f30
543 ldd [%o1 + 0x38], %f32
544 ldd [%o1 + 0x40], %f34
545 ldd [%o1 + 0x48], %f36
546 ldd [%o1 + 0x50], %f38
547 ldd [%o1 + 0x58], %f40
548 ldd [%o1 + 0x60], %f42
549 ldd [%o1 + 0x68], %f44
550 ldd [%o1 + 0x70], %f46
551 ldd [%o1 + 0x78], %f48
552 subcc %o2, 1, %o2 ! done yet?
553 ldd [%o1 + 0x80], %f50
555 prefetch [%o1 + 63], 20
556 prefetch [%o1 + 64+63], 20
558 faligndata %f18, %f20, %f16
559 faligndata %f20, %f22, %f18
560 faligndata %f22, %f24, %f20
561 faligndata %f24, %f26, %f22
562 faligndata %f26, %f28, %f24
563 faligndata %f28, %f30, %f26
564 faligndata %f30, %f32, %f28
565 faligndata %f32, %f34, %f30
566 faligndata %f34, %f36, %f32
567 faligndata %f36, %f38, %f34
568 faligndata %f38, %f40, %f36
569 faligndata %f40, %f42, %f38
570 faligndata %f42, %f44, %f40
571 faligndata %f44, %f46, %f42
572 faligndata %f46, %f48, %f44
573 faligndata %f48, %f50, %f46
575 .word 0x81b02860 ! SHA512
577 bne,pt SIZE_T_CC, .Lhwunaligned_loop
578 for %f50, %f50, %f18 ! %f18=%f50
583 $code.=<<___ if ($SZ==4); # SHA256
592 bne,pn %icc, .Lhwunaligned
596 ldd [%o1 + 0x00], %f8
597 ldd [%o1 + 0x08], %f10
598 ldd [%o1 + 0x10], %f12
599 ldd [%o1 + 0x18], %f14
600 ldd [%o1 + 0x20], %f16
601 ldd [%o1 + 0x28], %f18
602 ldd [%o1 + 0x30], %f20
603 subcc %o2, 1, %o2 ! done yet?
604 ldd [%o1 + 0x38], %f22
606 prefetch [%o1 + 63], 20
608 .word 0x81b02840 ! SHA256
610 bne,pt SIZE_T_CC, .Lhwloop
614 st %f0, [%o0 + 0x00] ! store context
626 alignaddr %o1, %g0, %o1
628 ldd [%o1 + 0x00], %f10
630 ldd [%o1 + 0x08], %f12
631 ldd [%o1 + 0x10], %f14
632 ldd [%o1 + 0x18], %f16
633 ldd [%o1 + 0x20], %f18
634 ldd [%o1 + 0x28], %f20
635 ldd [%o1 + 0x30], %f22
636 ldd [%o1 + 0x38], %f24
637 subcc %o2, 1, %o2 ! done yet?
638 ldd [%o1 + 0x40], %f26
640 prefetch [%o1 + 63], 20
642 faligndata %f10, %f12, %f8
643 faligndata %f12, %f14, %f10
644 faligndata %f14, %f16, %f12
645 faligndata %f16, %f18, %f14
646 faligndata %f18, %f20, %f16
647 faligndata %f20, %f22, %f18
648 faligndata %f22, %f24, %f20
649 faligndata %f24, %f26, %f22
651 .word 0x81b02840 ! SHA256
653 bne,pt SIZE_T_CC, .Lhwunaligned_loop
654 for %f26, %f26, %f10 ! %f10=%f26
662 save %sp,-STACK_FRAME-$locals,%sp
663 and $inp,`$align-1`,$tmp31
664 sllx $len,`log(16*$SZ)/log(2)`,$len
665 andn $inp,`$align-1`,$inp
669 $code.=<<___ if ($SZ==8); # SHA512
671 sub $tmp32,$tmp31,$tmp32
675 add %o7,K${label}-.Lpic,$Ktbl
677 $LD [$ctx+`0*$SZ`],$A
678 $LD [$ctx+`1*$SZ`],$B
679 $LD [$ctx+`2*$SZ`],$C
680 $LD [$ctx+`3*$SZ`],$D
681 $LD [$ctx+`4*$SZ`],$E
682 $LD [$ctx+`5*$SZ`],$F
683 $LD [$ctx+`6*$SZ`],$G
684 $LD [$ctx+`7*$SZ`],$H
688 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
690 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
692 and $tmp2,0xfff,$tmp2
695 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
698 $code.=<<___ if ($SZ==4); # SHA256
699 $LD [$ctx+`0*$SZ`],@X[0]
700 $LD [$ctx+`1*$SZ`],@X[1]
701 $LD [$ctx+`2*$SZ`],@X[2]
702 $LD [$ctx+`3*$SZ`],@X[3]
703 $LD [$ctx+`4*$SZ`],@X[4]
704 $LD [$ctx+`5*$SZ`],@X[5]
705 $LD [$ctx+`6*$SZ`],@X[6]
706 $LD [$ctx+`7*$SZ`],@X[7]
709 $ST $A,[$ctx+`0*$SZ`]
711 $ST $B,[$ctx+`1*$SZ`]
713 $ST $C,[$ctx+`2*$SZ`]
715 $ST $D,[$ctx+`3*$SZ`]
717 $ST $E,[$ctx+`4*$SZ`]
719 $ST $F,[$ctx+`5*$SZ`]
721 $ST $G,[$ctx+`6*$SZ`]
723 $ST $H,[$ctx+`7*$SZ`]
725 $code.=<<___ if ($SZ==8); # SHA512
726 ld [$ctx+`0*$SZ+0`],%l0
727 ld [$ctx+`0*$SZ+4`],%l1
728 ld [$ctx+`1*$SZ+0`],%l2
729 ld [$ctx+`1*$SZ+4`],%l3
730 ld [$ctx+`2*$SZ+0`],%l4
731 ld [$ctx+`2*$SZ+4`],%l5
732 ld [$ctx+`3*$SZ+0`],%l6
735 ld [$ctx+`3*$SZ+4`],%l7
741 $ST $A,[$ctx+`0*$SZ`]
743 $ST $B,[$ctx+`1*$SZ`]
748 $ST $C,[$ctx+`2*$SZ`]
750 $ST $D,[$ctx+`3*$SZ`]
752 ld [$ctx+`4*$SZ+0`],%l0
753 ld [$ctx+`4*$SZ+4`],%l1
754 ld [$ctx+`5*$SZ+0`],%l2
755 ld [$ctx+`5*$SZ+4`],%l3
756 ld [$ctx+`6*$SZ+0`],%l4
757 ld [$ctx+`6*$SZ+4`],%l5
758 ld [$ctx+`7*$SZ+0`],%l6
761 ld [$ctx+`7*$SZ+4`],%l7
767 $ST $E,[$ctx+`4*$SZ`]
769 $ST $F,[$ctx+`5*$SZ`]
774 $ST $G,[$ctx+`6*$SZ`]
776 $ST $H,[$ctx+`7*$SZ`]
779 add $inp,`16*$SZ`,$inp ! advance inp
782 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
786 .type sha${label}_block_data_order,#function
787 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
788 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
792 # Purpose of these subroutines is to explicitly encode VIS instructions,
793 # so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
795 # Idea is to reserve for option to produce "universal" binary and let
796 # programmer detect if current CPU is VIS capable at run-time.
# Body of the VIS-instruction encoder: given a VIS mnemonic and three
# %f register operands, emit a raw ".word" encoding so the module
# assembles even without VIS support enabled in the assembler.
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# opcode-field values per mnemonic (only faligndata visible in this view)
my %visopf = ( "faligndata" => 0x048,
# fallback: the instruction spelled out verbatim for the assembler
$ref = "$mnemonic\t$rs1,$rs2,$rd";
# assignment (not comparison) intended: look up the opf field,
# fall through to the verbatim form for unknown mnemonics
if ($opf=$visopf{$mnemonic}) {
# convert each %fN operand to its 5-bit register number
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%f([0-9]{1,2})/); # not a %f register: punt
# odd register number cannot be re-encoded — punt to the assembler
# (NOTE(review): enclosing >=32 check presumably sits just above;
# not visible in this view)
return $ref if ($1&1);
# re-encode for upper double register addressing
# VIS FPop opcode template with rd/rs1/opf/rs2 fields filled in
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Body of the alignaddr encoder: emit a raw ".word" for alignaddr on
# integer registers, again to avoid requiring VIS assembler support.
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# base of each register class in the 5-bit encoding: %g0-7 -> 0-7,
# %o0-7 -> 8-15, %l0-7 -> 16-23, %i0-7 -> 24-31
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# fallback: the instruction spelled out verbatim for the assembler
my $ref="$mnemonic\t$rs1,$rs2,$rd";
# translate each operand name into its numeric register encoding
foreach ($rs1,$rs2,$rd) {
if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
else { return $ref; } # unrecognized operand: punt to assembler
# alignaddr opcode template with rd/rs1/rs2 fields filled in
return sprintf ".word\t0x%08x !%s",
0x81b00300|$rd<<25|$rs1<<14|$rs2,
837 foreach (split("\n",$code)) {
838 s/\`([^\`]*)\`/eval $1/ge;
840 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
843 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
844 &unalignaddr($1,$2,$3,$4)